[llvm-branch-commits] [llvm] DAG: Lower is.fpclass fcSubnormal|fcZero to fabs(x) < smallest_normal (PR #100390)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Jul 26 04:55:40 PDT 2024


In-Reply-To: <llvm.org/llvm/llvm-project/pull/100390 at github.com>


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/100390

From b9995a14f5dee5620163e83468d3be4fcd3e67d0 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 24 Jul 2024 14:27:12 +0100
Subject: [PATCH 001/259] [MachineLoopInfo] Fix assertion failure on undef use
 operands (#100137)

Fixes #100115
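
The fix switches the check from MO.isUse() to MO.readsReg(): an <undef> use
operand is still a use, but it does not read a defined value, so there is no
virtual-register def for MRI->getVRegDef() to return and the assertion below
the check fires. A condensed sketch of the corrected check (simplified from
MachineLoop::isLoopInvariant for illustration; not the verbatim loop):

  for (const MachineOperand &MO : I.operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;
    // readsReg() is false for <undef> uses, so the def lookup below is only
    // reached for operands that actually read a defined value.
    if (!MO.readsReg())
      continue;
    assert(MRI->getVRegDef(MO.getReg()) && "virtual register use without a def");
    // ... invariance checks on the defining instruction ...
  }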
---
 llvm/lib/CodeGen/MachineLoopInfo.cpp          |  2 +-
 llvm/test/CodeGen/AMDGPU/dpp64_combine.ll     | 25 +++++-
 .../CodeGen/AMDGPU/machinelicm-undef-use.mir  | 79 +++++++++++++++++++
 3 files changed, 103 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir

diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index a03c008e6045a..88ba10fbe29a9 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -287,7 +287,7 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I,
       }
     }
 
-    if (!MO.isUse())
+    if (!MO.readsReg())
       continue;
 
     assert(MRI->getVRegDef(Reg) &&
diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index b4218bc2afc7f..3d8cda589f46f 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
 
 ; GCN-LABEL: {{^}}dpp64_ceil:
 ; GCN:           global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
@@ -69,6 +69,27 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
   ret void
 }
 
+; GCN-LABEL: {{^}}dpp64_loop:
+; GCN: v_mov_b32_dpp
+; DPP64: v_mov_b32_dpp
+; GFX90A: v_add_co_u32_e32
+; GFX90A: v_addc_co_u32_e32
+; DPPMOV64: v_lshl_add_u64
+; GFX10: v_mov_b32_dpp
+; GFX10: v_add_co_u32
+; GFX10: v_add_co_ci_u32_e32
+; GFX11: v_add_co_u32_e64_dpp
+; GFX11: v_add_co_ci_u32_e32
+define amdgpu_cs void @dpp64_loop(i64 %arg) {
+bb:
+  br label %bb1
+bb1:
+  %i = call i64 @llvm.amdgcn.update.dpp.i64(i64 0, i64 0, i32 0, i32 0, i32 0, i1 false)
+  %i2 = add i64 %i, %arg
+  %i3 = atomicrmw add ptr addrspace(1) null, i64 %i2 monotonic, align 8
+  br label %bb1
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
 declare double @llvm.ceil.f64(double)
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir
new file mode 100644
index 0000000000000..07eaa9f95604d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-undef-use.mir
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=early-machinelicm %s -o - | FileCheck %s
+
+# Issue #100115: test that MachineLICM does not assert on the undef use operand
+# of the REG_SEQUENCE instruction.
+---
+name: test_undef_use
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: test_undef_use
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, undef [[DEF]], %subreg.sub1
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.5(0x04000000), %bb.1(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc_lo = COPY undef [[DEF1]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.5, implicit $vcc_lo
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_LOOP undef [[DEF1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %2:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, undef %0, %subreg.sub1
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.5(0x04000000), %bb.1(0x7c000000)
+
+    $vcc_lo = COPY undef %1
+    S_CBRANCH_VCCNZ %bb.5, implicit $vcc
+    S_BRANCH %bb.1
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+    S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+
+    SI_LOOP undef %1, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.5:
+    S_ENDPGM 0
+...

From 6da6772bf0a33131aa8540c9d4f60d5db75c32b5 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 24 Jul 2024 14:30:25 +0100
Subject: [PATCH 002/259] [AArch64][SME] Rewrite __arm_get_current_vg to
 preserve required registers (#100143)

The documentation for the __arm_get_current_vg support routine specifies
that the following registers are call-preserved:
 - X1-X15, X19-X29 and SP
 - Z0-Z31
 - P0-P15

This patch rewrites the implementation of this routine in compiler-rt,
as the current version does not guarantee that these registers will be
preserved.
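
The control flow of the new assembly mirrors the C implementation that this
patch removes from sme-abi-vg.c; restated at the C level (illustrative only,
with a hypothetical function name; the real routine stays in hand-written
assembly precisely so it can avoid clobbering the registers listed above):

  long arm_get_current_vg_logic(void) {
    unsigned long long features =
        __atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED);
    bool HasSVE = features & (1ULL << FEAT_SVE);
    if (!HasSVE && !__aarch64_has_sme_and_tpidr2_el0)
      return 0;                              // neither SVE nor SME available
    struct SME_STATE State = __arm_sme_state();
    if (HasSVE || (State.PSTATE & 1)) {      // PSTATE.SM set: streaming mode
      long vg;
      __asm__ __volatile__("cntd %0" : "=r"(vg)); // vector length in 64-bit granules
      return vg;
    }
    return 0;
  }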
---
 compiler-rt/lib/builtins/aarch64/sme-abi-vg.c | 28 ------------
 compiler-rt/lib/builtins/aarch64/sme-abi.S    | 44 +++++++++++++++++++
 2 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
index 062cf80fc6848..20061012e16c6 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
@@ -10,15 +10,6 @@ struct FEATURES {
 
 extern struct FEATURES __aarch64_cpu_features;
 
-struct SME_STATE {
-  long PSTATE;
-  long TPIDR2_EL0;
-};
-
-extern struct SME_STATE __arm_sme_state(void) __arm_streaming_compatible;
-
-extern bool __aarch64_has_sme_and_tpidr2_el0;
-
 #if __GNUC__ >= 9
 #pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
 #endif
@@ -28,22 +19,3 @@ __attribute__((constructor(90))) static void get_aarch64_cpu_features(void) {
 
   __init_cpu_features();
 }
-
-__attribute__((target("sve"))) long
-__arm_get_current_vg(void) __arm_streaming_compatible {
-  struct SME_STATE State = __arm_sme_state();
-  unsigned long long features =
-      __atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED);
-  bool HasSVE = features & (1ULL << FEAT_SVE);
-
-  if (!HasSVE && !__aarch64_has_sme_and_tpidr2_el0)
-    return 0;
-
-  if (HasSVE || (State.PSTATE & 1)) {
-    long vl;
-    __asm__ __volatile__("cntd %0" : "=r"(vl));
-    return vl;
-  }
-
-  return 0;
-}
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 4c0ff66931db7..cd8153f60670f 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -12,11 +12,15 @@
 #if !defined(__APPLE__)
 #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
 #define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
+#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)
+#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features)
 #else
 // MachO requires @page/@pageoff directives because the global is defined
 // in a different file. Otherwise this file may fail to build.
 #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page
 #define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff
+#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page
+#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
 .arch armv9-a+sme
@@ -180,6 +184,46 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
   ret
 END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable)
 
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
+  .variant_pcs __arm_get_current_vg
+  BTI_C
+
+  stp     x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov     x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+  adrp    x17, CPU_FEATS_SYMBOL
+  ldr     w17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbnz    w17, #30, 0f
+  adrp    x16, TPIDR2_SYMBOL
+  ldrb    w16, [x16, TPIDR2_SYMBOL_OFFSET]
+  cbz     w16, 1f
+0:
+  mov     x18, x1
+  bl      __arm_sme_state
+  mov     x1, x18
+  and     x17, x17, #0x40000000
+  bfxil   x17, x0, #0, #1
+  cbz     x17, 1f
+  cntd    x0
+  .cfi_def_cfa wsp, 16
+  ldp     x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+1:
+  mov     x0, xzr
+  .cfi_def_cfa wsp, 16
+  ldp     x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC

From 07688d13414d08a1bea3cecf6607f93e4334222e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 24 Jul 2024 14:32:35 +0100
Subject: [PATCH 003/259] Revert "[LV] Add option to still enable the legacy
 cost model. (#99536)"

This reverts commit 9ba524427321b931bad156860755adf420aeec6a.

Remove the recently added temporary option vectorize-use-legacy-cost-model
as discussed on the PR adding it, now that we branched for 19.x.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 30 +++++++------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e4477f96aa624..b709ccc3c35a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -206,11 +206,6 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
-static cl::opt<bool> UseLegacyCostModel(
-    "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden,
-    cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
-             "This option will be removed in the future."));
-
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -9976,9 +9971,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                  &CM, BFI, PSI, Checks);
 
-      VPlan &BestPlan =
-          UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
-      assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
+      VPlan &BestPlan = LVP.getBestPlan();
+      assert(BestPlan.hasScalarVFOnly() &&
              "VPlan cost model and legacy cost model disagreed");
       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
 
@@ -10095,18 +10089,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!MainILV.areSafetyChecksAdded())
           DisableRuntimeUnroll = true;
       } else {
-        ElementCount Width = VF.Width;
-        VPlan &BestPlan =
-            UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
-        if (!UseLegacyCostModel) {
-          assert(size(BestPlan.vectorFactors()) == 1 &&
-                 "Plan should have a single VF");
-          Width = *BestPlan.vectorFactors().begin();
-          LLVM_DEBUG(dbgs()
-                     << "VF picked by VPlan cost model: " << Width << "\n");
-          assert(VF.Width == Width &&
-                 "VPlan cost model and legacy cost model disagreed");
-        }
+        VPlan &BestPlan = LVP.getBestPlan();
+        assert(size(BestPlan.vectorFactors()) == 1 &&
+               "Plan should have a single VF");
+        ElementCount Width = *BestPlan.vectorFactors().begin();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);

From 0b36144aba8d8d439bdc297008eb3bd43a199fca Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 24 Jul 2024 11:37:07 +0100
Subject: [PATCH 004/259] [X86] Add 'clamp out of range index' variable
 shuffles test coverage from #96754

---
 llvm/test/CodeGen/X86/var-permute-128.ll | 887 +++++++++++++++++++++++
 llvm/test/CodeGen/X86/var-permute-256.ll | 759 +++++++++++++++++++
 2 files changed, 1646 insertions(+)
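
Every new test follows the same pattern: compare each index against the lane
count, force out-of-range indices to all-ones with a select, build the shuffle
from per-lane extractelement/insertelement, and zero the affected lanes with a
final select. A scalar C++ restatement of that IR pattern (illustrative only;
the masking of the clamped index models the index masking the lowering
performs, and N is assumed to be a power of two):

  #include <array>
  #include <cstddef>
  #include <cstdint>

  template <typename T, size_t N>
  std::array<T, N> var_shuffle_zero(const std::array<T, N> &V,
                                    const std::array<uint64_t, N> &Idx) {
    std::array<T, N> R{};
    for (size_t I = 0; I != N; ++I) {
      bool OutOfRange = Idx[I] > N - 1;               // icmp ugt %indices, N-1
      uint64_t Clamped = OutOfRange ? ~0ULL : Idx[I]; // select: clamp to all-ones
      T Elt = V[Clamped & (N - 1)];                   // extract with masked index
      R[I] = OutOfRange ? T(0) : Elt;                 // final select zeroes the lane
    }
    return R;
  }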

diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 6350344457d87..52ede755df5d6 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -63,6 +63,145 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
   ret <2 x i64> %ret1
 }
 
+define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v2i64:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE3-NEXT:    pxor %xmm1, %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE3-NEXT:    pand %xmm4, %xmm3
+; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE3-NEXT:    por %xmm3, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm1
+; SSE3-NEXT:    movq %xmm1, %rax
+; SSE3-NEXT:    andl $1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movq %xmm1, %rcx
+; SSE3-NEXT:    andl $1, %ecx
+; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT:    pandn %xmm0, %xmm2
+; SSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v2i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSSE3-NEXT:    movq %xmm1, %rcx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v2i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT:    pand %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT:    pandn %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v2i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v2i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
+  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
+  %idx0 = extractelement <2 x i64> %or, i64 0
+  %idx1 = extractelement <2 x i64> %or, i64 1
+  %elt0 = extractelement <2 x i64> %v, i64 %idx0
+  %elt1 = extractelement <2 x i64> %v, i64 %idx1
+  %vec0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
+  %vec1 = insertelement <2 x i64> %vec0, i64 %elt1, i64 1
+  %res = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vec1
+  ret <2 x i64> %res
+}
+
 define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v4i32:
 ; SSE3:       # %bb.0:
@@ -126,6 +265,135 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
   ret <4 x i32> %ret3
 }
 
+define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v4i32:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movd %xmm1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE3-NEXT:    movd %xmm3, %ecx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movd %xmm3, %edx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE3-NEXT:    movd %xmm1, %esi
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $3, %eax
+; SSE3-NEXT:    andl $3, %ecx
+; SSE3-NEXT:    andl $3, %edx
+; SSE3-NEXT:    andl $3, %esi
+; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v4i32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v4i32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v4i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
+  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
+  %idx0 = extractelement <4 x i32> %or, i64 0
+  %idx1 = extractelement <4 x i32> %or, i64 1
+  %idx2 = extractelement <4 x i32> %or, i64 2
+  %idx3 = extractelement <4 x i32> %or, i64 3
+  %elt0 = extractelement <4 x i32> %v, i32 %idx0
+  %elt1 = extractelement <4 x i32> %v, i32 %idx1
+  %elt2 = extractelement <4 x i32> %v, i32 %idx2
+  %elt3 = extractelement <4 x i32> %v, i32 %idx3
+  %vec0 = insertelement <4 x i32> poison, i32 %elt0, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+  %res = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vec3
+  ret <4 x i32> %res
+}
+
 define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v8i16:
 ; SSE3:       # %bb.0:
@@ -223,6 +491,156 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
   ret <8 x i16> %ret7
 }
 
+define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v8i16:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
+; SSE3-NEXT:    psubusw %xmm1, %xmm3
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    pcmpeqw %xmm3, %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    pextrw $0, %xmm1, %eax
+; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE3-NEXT:    pextrw $2, %xmm1, %edx
+; SSE3-NEXT:    pextrw $3, %xmm1, %esi
+; SSE3-NEXT:    pextrw $4, %xmm1, %edi
+; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
+; SSE3-NEXT:    pextrw $6, %xmm1, %r9d
+; SSE3-NEXT:    pextrw $7, %xmm1, %r10d
+; SSE3-NEXT:    movdqa %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $7, %eax
+; SSE3-NEXT:    andl $7, %ecx
+; SSE3-NEXT:    andl $7, %edx
+; SSE3-NEXT:    andl $7, %esi
+; SSE3-NEXT:    andl $7, %edi
+; SSE3-NEXT:    andl $7, %r8d
+; SSE3-NEXT:    andl $7, %r9d
+; SSE3-NEXT:    andl $7, %r10d
+; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
+; SSE3-NEXT:    movd %r10d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
+; SSE3-NEXT:    movd %r9d, %xmm2
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
+; SSE3-NEXT:    movd %r8d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
+; SSE3-NEXT:    movd %edi, %xmm3
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
+; SSE3-NEXT:    movd %esi, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
+; SSE3-NEXT:    movd %edx, %xmm2
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
+; SSE3-NEXT:    movd %ecx, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; SSE3-NEXT:    movd %eax, %xmm4
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE3-NEXT:    pandn %xmm4, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v8i16:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
+; SSSE3-NEXT:    psubusw %xmm1, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    pcmpeqw %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
+; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    pmaxuw %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
+; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v8i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; XOP-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqu16 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  %or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
+  %idx0 = extractelement <8 x i16> %or, i64 0
+  %idx1 = extractelement <8 x i16> %or, i64 1
+  %idx2 = extractelement <8 x i16> %or, i64 2
+  %idx3 = extractelement <8 x i16> %or, i64 3
+  %idx4 = extractelement <8 x i16> %or, i64 4
+  %idx5 = extractelement <8 x i16> %or, i64 5
+  %idx6 = extractelement <8 x i16> %or, i64 6
+  %idx7 = extractelement <8 x i16> %or, i64 7
+  %elt0 = extractelement <8 x i16> %v, i16 %idx0
+  %elt1 = extractelement <8 x i16> %v, i16 %idx1
+  %elt2 = extractelement <8 x i16> %v, i16 %idx2
+  %elt3 = extractelement <8 x i16> %v, i16 %idx3
+  %elt4 = extractelement <8 x i16> %v, i16 %idx4
+  %elt5 = extractelement <8 x i16> %v, i16 %idx5
+  %elt6 = extractelement <8 x i16> %v, i16 %idx6
+  %elt7 = extractelement <8 x i16> %v, i16 %idx7
+  %vec0 = insertelement <8 x i16> poison, i16 %elt0, i64 0
+  %vec1 = insertelement <8 x i16> %vec0, i16 %elt1, i64 1
+  %vec2 = insertelement <8 x i16> %vec1, i16 %elt2, i64 2
+  %vec3 = insertelement <8 x i16> %vec2, i16 %elt3, i64 3
+  %vec4 = insertelement <8 x i16> %vec3, i16 %elt4, i64 4
+  %vec5 = insertelement <8 x i16> %vec4, i16 %elt5, i64 5
+  %vec6 = insertelement <8 x i16> %vec5, i16 %elt6, i64 6
+  %vec7 = insertelement <8 x i16> %vec6, i16 %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vec7
+  ret <8 x i16> %res
+}
+
 define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8:
 ; SSE3:       # %bb.0:
@@ -374,6 +792,209 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
   ret <16 x i8> %ret15
 }
 
+define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v16i8:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE3-NEXT:    pmaxub %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, -40(%rsp)
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    movzbl -25(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm1
+; SSE3-NEXT:    movzbl -26(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm2
+; SSE3-NEXT:    movzbl -27(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm4
+; SSE3-NEXT:    movzbl -28(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm3
+; SSE3-NEXT:    movzbl -29(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm6
+; SSE3-NEXT:    movzbl -30(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm7
+; SSE3-NEXT:    movzbl -31(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm8
+; SSE3-NEXT:    movzbl -32(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm5
+; SSE3-NEXT:    movzbl -33(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm9
+; SSE3-NEXT:    movzbl -34(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm10
+; SSE3-NEXT:    movzbl -35(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm12
+; SSE3-NEXT:    movzbl -36(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm11
+; SSE3-NEXT:    movzbl -37(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm13
+; SSE3-NEXT:    movzbl -38(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm14
+; SSE3-NEXT:    movzbl -39(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm15
+; SSE3-NEXT:    movzbl -40(%rsp), %eax
+; SSE3-NEXT:    andl $15, %eax
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE3-NEXT:    movd %eax, %xmm1
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSSE3-NEXT:    pmaxub %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v16i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE41-NEXT:    pmaxub %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqb %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v16i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v16i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqu8 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  %or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
+  %idx0 = extractelement <16 x i8> %or, i64 0
+  %idx1 = extractelement <16 x i8> %or, i64 1
+  %idx2 = extractelement <16 x i8> %or, i64 2
+  %idx3 = extractelement <16 x i8> %or, i64 3
+  %idx4 = extractelement <16 x i8> %or, i64 4
+  %idx5 = extractelement <16 x i8> %or, i64 5
+  %idx6 = extractelement <16 x i8> %or, i64 6
+  %idx7 = extractelement <16 x i8> %or, i64 7
+  %idx8 = extractelement <16 x i8> %or, i64 8
+  %idx9 = extractelement <16 x i8> %or, i64 9
+  %idxA = extractelement <16 x i8> %or, i64 10
+  %idxB = extractelement <16 x i8> %or, i64 11
+  %idxC = extractelement <16 x i8> %or, i64 12
+  %idxD = extractelement <16 x i8> %or, i64 13
+  %idxE = extractelement <16 x i8> %or, i64 14
+  %idxF = extractelement <16 x i8> %or, i64 15
+  %elt0 = extractelement <16 x i8> %v, i8 %idx0
+  %elt1 = extractelement <16 x i8> %v, i8 %idx1
+  %elt2 = extractelement <16 x i8> %v, i8 %idx2
+  %elt3 = extractelement <16 x i8> %v, i8 %idx3
+  %elt4 = extractelement <16 x i8> %v, i8 %idx4
+  %elt5 = extractelement <16 x i8> %v, i8 %idx5
+  %elt6 = extractelement <16 x i8> %v, i8 %idx6
+  %elt7 = extractelement <16 x i8> %v, i8 %idx7
+  %elt8 = extractelement <16 x i8> %v, i8 %idx8
+  %elt9 = extractelement <16 x i8> %v, i8 %idx9
+  %eltA = extractelement <16 x i8> %v, i8 %idxA
+  %eltB = extractelement <16 x i8> %v, i8 %idxB
+  %eltC = extractelement <16 x i8> %v, i8 %idxC
+  %eltD = extractelement <16 x i8> %v, i8 %idxD
+  %eltE = extractelement <16 x i8> %v, i8 %idxE
+  %eltF = extractelement <16 x i8> %v, i8 %idxF
+  %vec0 = insertelement <16 x i8> poison, i8 %elt0, i64 0
+  %vec1 = insertelement <16 x i8> %vec0, i8 %elt1, i64 1
+  %vec2 = insertelement <16 x i8> %vec1, i8 %elt2, i64 2
+  %vec3 = insertelement <16 x i8> %vec2, i8 %elt3, i64 3
+  %vec4 = insertelement <16 x i8> %vec3, i8 %elt4, i64 4
+  %vec5 = insertelement <16 x i8> %vec4, i8 %elt5, i64 5
+  %vec6 = insertelement <16 x i8> %vec5, i8 %elt6, i64 6
+  %vec7 = insertelement <16 x i8> %vec6, i8 %elt7, i64 7
+  %vec8 = insertelement <16 x i8> %vec7, i8 %elt8, i64 8
+  %vec9 = insertelement <16 x i8> %vec8, i8 %elt9, i64 9
+  %vecA = insertelement <16 x i8> %vec9, i8 %eltA, i64 10
+  %vecB = insertelement <16 x i8> %vecA, i8 %eltB, i64 11
+  %vecC = insertelement <16 x i8> %vecB, i8 %eltC, i64 12
+  %vecD = insertelement <16 x i8> %vecC, i8 %eltD, i64 13
+  %vecE = insertelement <16 x i8> %vecD, i8 %eltE, i64 14
+  %vecF = insertelement <16 x i8> %vecE, i8 %eltF, i64 15
+  %res = select <16 x i1> %cmp, <16 x i8> zeroinitializer, <16 x i8> %vecF
+  ret <16 x i8> %res
+}
+
 define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v2f64:
 ; SSE3:       # %bb.0:
@@ -424,6 +1045,143 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
   ret <2 x double> %ret1
 }
 
+define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v2f64:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE3-NEXT:    pxor %xmm1, %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE3-NEXT:    pand %xmm4, %xmm3
+; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE3-NEXT:    por %xmm3, %xmm2
+; SSE3-NEXT:    por %xmm2, %xmm1
+; SSE3-NEXT:    movq %xmm1, %rax
+; SSE3-NEXT:    andl $1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movq %xmm1, %rcx
+; SSE3-NEXT:    andl $1, %ecx
+; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; SSE3-NEXT:    pandn %xmm0, %xmm2
+; SSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v2f64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT:    pand %xmm4, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm1, %rax
+; SSSE3-NEXT:    andl $1, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSSE3-NEXT:    movq %xmm1, %rcx
+; SSSE3-NEXT:    andl $1, %ecx
+; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
+; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v2f64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE41-NEXT:    pand %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    pandn %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v2f64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovapd %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v2f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
+  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
+  %idx0 = extractelement <2 x i64> %or, i64 0
+  %idx1 = extractelement <2 x i64> %or, i64 1
+  %elt0 = extractelement <2 x double> %v, i64 %idx0
+  %elt1 = extractelement <2 x double> %v, i64 %idx1
+  %vec0 = insertelement <2 x double> poison, double %elt0, i64 0
+  %vec1 = insertelement <2 x double> %vec0, double %elt1, i64 1
+  %res = select <2 x i1> %cmp, <2 x double> zeroinitializer, <2 x double> %vec1
+  ret <2 x double> %res
+}
+
 define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v4f32:
 ; SSE3:       # %bb.0:
@@ -487,6 +1245,135 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
   ret <4 x float> %ret3
 }
 
+define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
+; SSE3-LABEL: var_shuffle_zero_v4f32:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT:    por %xmm0, %xmm1
+; SSE3-NEXT:    movd %xmm1, %eax
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE3-NEXT:    movd %xmm3, %ecx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE3-NEXT:    movd %xmm3, %edx
+; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE3-NEXT:    movd %xmm1, %esi
+; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
+; SSE3-NEXT:    andl $3, %eax
+; SSE3-NEXT:    andl $3, %ecx
+; SSE3-NEXT:    andl $3, %edx
+; SSE3-NEXT:    andl $3, %esi
+; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: var_shuffle_zero_v4f32:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: var_shuffle_zero_v4f32:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
+; SSE41-NEXT:    pmaxud %xmm1, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; XOP-LABEL: var_shuffle_zero_v4f32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
+  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
+  %idx0 = extractelement <4 x i32> %or, i64 0
+  %idx1 = extractelement <4 x i32> %or, i64 1
+  %idx2 = extractelement <4 x i32> %or, i64 2
+  %idx3 = extractelement <4 x i32> %or, i64 3
+  %elt0 = extractelement <4 x float> %v, i32 %idx0
+  %elt1 = extractelement <4 x float> %v, i32 %idx1
+  %elt2 = extractelement <4 x float> %v, i32 %idx2
+  %elt3 = extractelement <4 x float> %v, i32 %idx3
+  %vec0 = insertelement <4 x float> poison, float %elt0, i64 0
+  %vec1 = insertelement <4 x float> %vec0, float %elt1, i64 1
+  %vec2 = insertelement <4 x float> %vec1, float %elt2, i64 2
+  %vec3 = insertelement <4 x float> %vec2, float %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
 define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
 ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
 ; SSE3:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 0baa5bdf27624..2968c97a1355c 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -78,6 +78,111 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
   ret <4 x i64> %ret3
 }
 
+define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v4i64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,3]
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm4, %ymm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT:    vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k2
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
+  %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
+  %idx0 = extractelement <4 x i64> %or, i64 0
+  %idx1 = extractelement <4 x i64> %or, i64 1
+  %idx2 = extractelement <4 x i64> %or, i64 2
+  %idx3 = extractelement <4 x i64> %or, i64 3
+  %elt0 = extractelement <4 x i64> %v, i64 %idx0
+  %elt1 = extractelement <4 x i64> %v, i64 %idx1
+  %elt2 = extractelement <4 x i64> %v, i64 %idx2
+  %elt3 = extractelement <4 x i64> %v, i64 %idx3
+  %vec0 = insertelement <4 x i64> poison, i64 %elt0, i64 0
+  %vec1 = insertelement <4 x i64> %vec0, i64 %elt1, i64 1
+  %vec2 = insertelement <4 x i64> %vec1, i64 %elt2, i64 2
+  %vec3 = insertelement <4 x i64> %vec2, i64 %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x i64> zeroinitializer, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
 define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8i32:
 ; XOP:       # %bb.0:
@@ -130,6 +235,104 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
   ret <8 x i32> %ret7
 }
 
+define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v8i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm3, %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
+  %idx0 = extractelement <8 x i32> %or, i64 0
+  %idx1 = extractelement <8 x i32> %or, i64 1
+  %idx2 = extractelement <8 x i32> %or, i64 2
+  %idx3 = extractelement <8 x i32> %or, i64 3
+  %idx4 = extractelement <8 x i32> %or, i64 4
+  %idx5 = extractelement <8 x i32> %or, i64 5
+  %idx6 = extractelement <8 x i32> %or, i64 6
+  %idx7 = extractelement <8 x i32> %or, i64 7
+  %elt0 = extractelement <8 x i32> %v, i32 %idx0
+  %elt1 = extractelement <8 x i32> %v, i32 %idx1
+  %elt2 = extractelement <8 x i32> %v, i32 %idx2
+  %elt3 = extractelement <8 x i32> %v, i32 %idx3
+  %elt4 = extractelement <8 x i32> %v, i32 %idx4
+  %elt5 = extractelement <8 x i32> %v, i32 %idx5
+  %elt6 = extractelement <8 x i32> %v, i32 %idx6
+  %elt7 = extractelement <8 x i32> %v, i32 %idx7
+  %vec0 = insertelement <8 x i32> poison, i32 %elt0, i64 0
+  %vec1 = insertelement <8 x i32> %vec0, i32 %elt1, i64 1
+  %vec2 = insertelement <8 x i32> %vec1, i32 %elt2, i64 2
+  %vec3 = insertelement <8 x i32> %vec2, i32 %elt3, i64 3
+  %vec4 = insertelement <8 x i32> %vec3, i32 %elt4, i64 4
+  %vec5 = insertelement <8 x i32> %vec4, i32 %elt5, i64 5
+  %vec6 = insertelement <8 x i32> %vec5, i32 %elt6, i64 6
+  %vec7 = insertelement <8 x i32> %vec6, i32 %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x i32> zeroinitializer, <8 x i32> %vec7
+  ret <8 x i32> %res
+}
+
 define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v16i16:
 ; XOP:       # %bb.0:
@@ -262,6 +465,163 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
   ret <16 x i16> %ret15
 }
 
+define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v16i16:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOP-NEXT:    vpcomgtuw %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuw %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [256,256,256,256,256,256,256,256]
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm4 = [514,514,514,514,514,514,514,514]
+; XOP-NEXT:    vpmacsww %xmm3, %xmm4, %xmm1, %xmm5
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpmacsww %xmm3, %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm1
+; XOP-NEXT:    vpperm %xmm5, %xmm3, %xmm0, %xmm0
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpmaxuw %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxuw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [256,256,256,256,256,256,256,256]
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm7
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: var_shuffle_zero_v16i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLDQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512VLDQ-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpternlogq $202, %ymm3, %ymm0, %ymm1
+; AVX512VLDQ-NEXT:    vpandn %ymm1, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT:    retq
+;
+; VLVBMI-LABEL: var_shuffle_zero_v16i16:
+; VLVBMI:       # %bb.0:
+; VLVBMI-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT:    retq
+  %cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
+  %idx0 = extractelement <16 x i16> %or, i64 0
+  %idx1 = extractelement <16 x i16> %or, i64 1
+  %idx2 = extractelement <16 x i16> %or, i64 2
+  %idx3 = extractelement <16 x i16> %or, i64 3
+  %idx4 = extractelement <16 x i16> %or, i64 4
+  %idx5 = extractelement <16 x i16> %or, i64 5
+  %idx6 = extractelement <16 x i16> %or, i64 6
+  %idx7 = extractelement <16 x i16> %or, i64 7
+  %idx8 = extractelement <16 x i16> %or, i64 8
+  %idx9 = extractelement <16 x i16> %or, i64 9
+  %idxA = extractelement <16 x i16> %or, i64 10
+  %idxB = extractelement <16 x i16> %or, i64 11
+  %idxC = extractelement <16 x i16> %or, i64 12
+  %idxD = extractelement <16 x i16> %or, i64 13
+  %idxE = extractelement <16 x i16> %or, i64 14
+  %idxF = extractelement <16 x i16> %or, i64 15
+  %elt0 = extractelement <16 x i16> %v, i16 %idx0
+  %elt1 = extractelement <16 x i16> %v, i16 %idx1
+  %elt2 = extractelement <16 x i16> %v, i16 %idx2
+  %elt3 = extractelement <16 x i16> %v, i16 %idx3
+  %elt4 = extractelement <16 x i16> %v, i16 %idx4
+  %elt5 = extractelement <16 x i16> %v, i16 %idx5
+  %elt6 = extractelement <16 x i16> %v, i16 %idx6
+  %elt7 = extractelement <16 x i16> %v, i16 %idx7
+  %elt8 = extractelement <16 x i16> %v, i16 %idx8
+  %elt9 = extractelement <16 x i16> %v, i16 %idx9
+  %eltA = extractelement <16 x i16> %v, i16 %idxA
+  %eltB = extractelement <16 x i16> %v, i16 %idxB
+  %eltC = extractelement <16 x i16> %v, i16 %idxC
+  %eltD = extractelement <16 x i16> %v, i16 %idxD
+  %eltE = extractelement <16 x i16> %v, i16 %idxE
+  %eltF = extractelement <16 x i16> %v, i16 %idxF
+  %vec0 = insertelement <16 x i16> poison, i16 %elt0, i64 0
+  %vec1 = insertelement <16 x i16> %vec0, i16 %elt1, i64 1
+  %vec2 = insertelement <16 x i16> %vec1, i16 %elt2, i64 2
+  %vec3 = insertelement <16 x i16> %vec2, i16 %elt3, i64 3
+  %vec4 = insertelement <16 x i16> %vec3, i16 %elt4, i64 4
+  %vec5 = insertelement <16 x i16> %vec4, i16 %elt5, i64 5
+  %vec6 = insertelement <16 x i16> %vec5, i16 %elt6, i64 6
+  %vec7 = insertelement <16 x i16> %vec6, i16 %elt7, i64 7
+  %vec8 = insertelement <16 x i16> %vec7, i16 %elt8, i64 8
+  %vec9 = insertelement <16 x i16> %vec8, i16 %elt9, i64 9
+  %vecA = insertelement <16 x i16> %vec9, i16 %eltA, i64 10
+  %vecB = insertelement <16 x i16> %vecA, i16 %eltB, i64 11
+  %vecC = insertelement <16 x i16> %vecB, i16 %eltC, i64 12
+  %vecD = insertelement <16 x i16> %vecC, i16 %eltD, i64 13
+  %vecE = insertelement <16 x i16> %vecD, i16 %eltE, i64 14
+  %vecF = insertelement <16 x i16> %vecE, i16 %eltF, i64 15
+  %res = select <16 x i1> %cmp, <16 x i16> zeroinitializer, <16 x i16> %vecF
+  ret <16 x i16> %res
+}
+
 define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v32i8:
 ; XOP:       # %bb.0:
@@ -431,6 +791,202 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
   ret <32 x i8> %ret31
 }
 
+define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v32i8:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; XOP-NEXT:    vpcomgtub %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtub %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOP-NEXT:    vpperm %xmm3, %xmm4, %xmm0, %xmm3
+; XOP-NEXT:    vpperm %xmm1, %xmm4, %xmm0, %xmm0
+; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT:    vpmaxub %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxub %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm7
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: var_shuffle_zero_v32i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpternlogq $202, %ymm3, %ymm0, %ymm1
+; AVX512VLDQ-NEXT:    vpandn %ymm1, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
+; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k2}
+; AVX512VLBW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm2 {%k1}
+; AVX512VLBW-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; VLVBMI-LABEL: var_shuffle_zero_v32i8:
+; VLVBMI:       # %bb.0:
+; VLVBMI-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT:    retq
+  %cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
+  %or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
+  %idx00 = extractelement <32 x i8> %or, i64 0
+  %idx01 = extractelement <32 x i8> %or, i64 1
+  %idx02 = extractelement <32 x i8> %or, i64 2
+  %idx03 = extractelement <32 x i8> %or, i64 3
+  %idx04 = extractelement <32 x i8> %or, i64 4
+  %idx05 = extractelement <32 x i8> %or, i64 5
+  %idx06 = extractelement <32 x i8> %or, i64 6
+  %idx07 = extractelement <32 x i8> %or, i64 7
+  %idx08 = extractelement <32 x i8> %or, i64 8
+  %idx09 = extractelement <32 x i8> %or, i64 9
+  %idx0A = extractelement <32 x i8> %or, i64 10
+  %idx0B = extractelement <32 x i8> %or, i64 11
+  %idx0C = extractelement <32 x i8> %or, i64 12
+  %idx0D = extractelement <32 x i8> %or, i64 13
+  %idx0E = extractelement <32 x i8> %or, i64 14
+  %idx0F = extractelement <32 x i8> %or, i64 15
+  %idx10 = extractelement <32 x i8> %or, i64 16
+  %idx11 = extractelement <32 x i8> %or, i64 17
+  %idx12 = extractelement <32 x i8> %or, i64 18
+  %idx13 = extractelement <32 x i8> %or, i64 19
+  %idx14 = extractelement <32 x i8> %or, i64 20
+  %idx15 = extractelement <32 x i8> %or, i64 21
+  %idx16 = extractelement <32 x i8> %or, i64 22
+  %idx17 = extractelement <32 x i8> %or, i64 23
+  %idx18 = extractelement <32 x i8> %or, i64 24
+  %idx19 = extractelement <32 x i8> %or, i64 25
+  %idx1A = extractelement <32 x i8> %or, i64 26
+  %idx1B = extractelement <32 x i8> %or, i64 27
+  %idx1C = extractelement <32 x i8> %or, i64 28
+  %idx1D = extractelement <32 x i8> %or, i64 29
+  %idx1E = extractelement <32 x i8> %or, i64 30
+  %idx1F = extractelement <32 x i8> %or, i64 31
+  %elt00 = extractelement <32 x i8> %v, i8 %idx00
+  %elt01 = extractelement <32 x i8> %v, i8 %idx01
+  %elt02 = extractelement <32 x i8> %v, i8 %idx02
+  %elt03 = extractelement <32 x i8> %v, i8 %idx03
+  %elt04 = extractelement <32 x i8> %v, i8 %idx04
+  %elt05 = extractelement <32 x i8> %v, i8 %idx05
+  %elt06 = extractelement <32 x i8> %v, i8 %idx06
+  %elt07 = extractelement <32 x i8> %v, i8 %idx07
+  %elt08 = extractelement <32 x i8> %v, i8 %idx08
+  %elt09 = extractelement <32 x i8> %v, i8 %idx09
+  %elt0A = extractelement <32 x i8> %v, i8 %idx0A
+  %elt0B = extractelement <32 x i8> %v, i8 %idx0B
+  %elt0C = extractelement <32 x i8> %v, i8 %idx0C
+  %elt0D = extractelement <32 x i8> %v, i8 %idx0D
+  %elt0E = extractelement <32 x i8> %v, i8 %idx0E
+  %elt0F = extractelement <32 x i8> %v, i8 %idx0F
+  %elt10 = extractelement <32 x i8> %v, i8 %idx10
+  %elt11 = extractelement <32 x i8> %v, i8 %idx11
+  %elt12 = extractelement <32 x i8> %v, i8 %idx12
+  %elt13 = extractelement <32 x i8> %v, i8 %idx13
+  %elt14 = extractelement <32 x i8> %v, i8 %idx14
+  %elt15 = extractelement <32 x i8> %v, i8 %idx15
+  %elt16 = extractelement <32 x i8> %v, i8 %idx16
+  %elt17 = extractelement <32 x i8> %v, i8 %idx17
+  %elt18 = extractelement <32 x i8> %v, i8 %idx18
+  %elt19 = extractelement <32 x i8> %v, i8 %idx19
+  %elt1A = extractelement <32 x i8> %v, i8 %idx1A
+  %elt1B = extractelement <32 x i8> %v, i8 %idx1B
+  %elt1C = extractelement <32 x i8> %v, i8 %idx1C
+  %elt1D = extractelement <32 x i8> %v, i8 %idx1D
+  %elt1E = extractelement <32 x i8> %v, i8 %idx1E
+  %elt1F = extractelement <32 x i8> %v, i8 %idx1F
+  %vec00 = insertelement <32 x i8> poison, i8 %elt00, i64 0
+  %vec01 = insertelement <32 x i8> %vec00, i8 %elt01, i64 1
+  %vec02 = insertelement <32 x i8> %vec01, i8 %elt02, i64 2
+  %vec03 = insertelement <32 x i8> %vec02, i8 %elt03, i64 3
+  %vec04 = insertelement <32 x i8> %vec03, i8 %elt04, i64 4
+  %vec05 = insertelement <32 x i8> %vec04, i8 %elt05, i64 5
+  %vec06 = insertelement <32 x i8> %vec05, i8 %elt06, i64 6
+  %vec07 = insertelement <32 x i8> %vec06, i8 %elt07, i64 7
+  %vec08 = insertelement <32 x i8> %vec07, i8 %elt08, i64 8
+  %vec09 = insertelement <32 x i8> %vec08, i8 %elt09, i64 9
+  %vec0A = insertelement <32 x i8> %vec09, i8 %elt0A, i64 10
+  %vec0B = insertelement <32 x i8> %vec0A, i8 %elt0B, i64 11
+  %vec0C = insertelement <32 x i8> %vec0B, i8 %elt0C, i64 12
+  %vec0D = insertelement <32 x i8> %vec0C, i8 %elt0D, i64 13
+  %vec0E = insertelement <32 x i8> %vec0D, i8 %elt0E, i64 14
+  %vec0F = insertelement <32 x i8> %vec0E, i8 %elt0F, i64 15
+  %vec10 = insertelement <32 x i8> %vec0F, i8 %elt10, i64 16
+  %vec11 = insertelement <32 x i8> %vec10, i8 %elt11, i64 17
+  %vec12 = insertelement <32 x i8> %vec11, i8 %elt12, i64 18
+  %vec13 = insertelement <32 x i8> %vec12, i8 %elt13, i64 19
+  %vec14 = insertelement <32 x i8> %vec13, i8 %elt14, i64 20
+  %vec15 = insertelement <32 x i8> %vec14, i8 %elt15, i64 21
+  %vec16 = insertelement <32 x i8> %vec15, i8 %elt16, i64 22
+  %vec17 = insertelement <32 x i8> %vec16, i8 %elt17, i64 23
+  %vec18 = insertelement <32 x i8> %vec17, i8 %elt18, i64 24
+  %vec19 = insertelement <32 x i8> %vec18, i8 %elt19, i64 25
+  %vec1A = insertelement <32 x i8> %vec19, i8 %elt1A, i64 26
+  %vec1B = insertelement <32 x i8> %vec1A, i8 %elt1B, i64 27
+  %vec1C = insertelement <32 x i8> %vec1B, i8 %elt1C, i64 28
+  %vec1D = insertelement <32 x i8> %vec1C, i8 %elt1D, i64 29
+  %vec1E = insertelement <32 x i8> %vec1D, i8 %elt1E, i64 30
+  %vec1F = insertelement <32 x i8> %vec1E, i8 %elt1F, i64 31
+  %res = select <32 x i1> %cmp, <32 x i8> zeroinitializer, <32 x i8> %vec1F
+  ret <32 x i8> %res
+}
+
 define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v4f64:
 ; XOP:       # %bb.0:
@@ -498,6 +1054,111 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
   ret <4 x double> %ret3
 }
 
+define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v4f64:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [3,3]
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtuq %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775811,9223372036854775811]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9223372036854775811,9223372036854775811,9223372036854775811,9223372036854775811]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [2,2,2,2]
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm4, %ymm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT:    vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k2
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v4f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
+  %or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
+  %idx0 = extractelement <4 x i64> %or, i64 0
+  %idx1 = extractelement <4 x i64> %or, i64 1
+  %idx2 = extractelement <4 x i64> %or, i64 2
+  %idx3 = extractelement <4 x i64> %or, i64 3
+  %elt0 = extractelement <4 x double> %v, i64 %idx0
+  %elt1 = extractelement <4 x double> %v, i64 %idx1
+  %elt2 = extractelement <4 x double> %v, i64 %idx2
+  %elt3 = extractelement <4 x double> %v, i64 %idx3
+  %vec0 = insertelement <4 x double> poison, double %elt0, i64 0
+  %vec1 = insertelement <4 x double> %vec0, double %elt1, i64 1
+  %vec2 = insertelement <4 x double> %vec1, double %elt2, i64 2
+  %vec3 = insertelement <4 x double> %vec2, double %elt3, i64 3
+  %res = select <4 x i1> %cmp, <4 x double> zeroinitializer, <4 x double> %vec3
+  ret <4 x double> %res
+}
+
 define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8f32:
 ; XOP:       # %bb.0:
@@ -550,6 +1211,104 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
   ret <8 x float> %ret7
 }
 
+define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
+; XOP-LABEL: var_shuffle_zero_v8f32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm2, %xmm2
+; XOP-NEXT:    vpcomgtud %xmm3, %xmm1, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; XOP-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_zero_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilps %ymm1, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm3, %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_zero_v8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_zero_v8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_zero_v8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovaps %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
+  %idx0 = extractelement <8 x i32> %or, i64 0
+  %idx1 = extractelement <8 x i32> %or, i64 1
+  %idx2 = extractelement <8 x i32> %or, i64 2
+  %idx3 = extractelement <8 x i32> %or, i64 3
+  %idx4 = extractelement <8 x i32> %or, i64 4
+  %idx5 = extractelement <8 x i32> %or, i64 5
+  %idx6 = extractelement <8 x i32> %or, i64 6
+  %idx7 = extractelement <8 x i32> %or, i64 7
+  %elt0 = extractelement <8 x float> %v, i32 %idx0
+  %elt1 = extractelement <8 x float> %v, i32 %idx1
+  %elt2 = extractelement <8 x float> %v, i32 %idx2
+  %elt3 = extractelement <8 x float> %v, i32 %idx3
+  %elt4 = extractelement <8 x float> %v, i32 %idx4
+  %elt5 = extractelement <8 x float> %v, i32 %idx5
+  %elt6 = extractelement <8 x float> %v, i32 %idx6
+  %elt7 = extractelement <8 x float> %v, i32 %idx7
+  %vec0 = insertelement <8 x float> poison, float %elt0, i64 0
+  %vec1 = insertelement <8 x float> %vec0, float %elt1, i64 1
+  %vec2 = insertelement <8 x float> %vec1, float %elt2, i64 2
+  %vec3 = insertelement <8 x float> %vec2, float %elt3, i64 3
+  %vec4 = insertelement <8 x float> %vec3, float %elt4, i64 4
+  %vec5 = insertelement <8 x float> %vec4, float %elt5, i64 5
+  %vec6 = insertelement <8 x float> %vec5, float %elt6, i64 6
+  %vec7 = insertelement <8 x float> %vec6, float %elt7, i64 7
+  %res = select <8 x i1> %cmp, <8 x float> zeroinitializer, <8 x float> %vec7
+  ret <8 x float> %res
+}
+
 ;
 ; PR35820 - Unequal source/destination vector sizes
 ;

>From de8c4bef3329ce0045b8167cdb96d53b2a1509df Mon Sep 17 00:00:00 2001
From: Kai Luo <lkail at cn.ibm.com>
Date: Wed, 24 Jul 2024 21:30:12 +0800
Subject: [PATCH 005/259] [tsan] Include unistd.h to declare gettid()

Try to fix https://lab.llvm.org/buildbot/#/builders/176/builds/14400.
---
 compiler-rt/test/tsan/debug_alloc_stack.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compiler-rt/test/tsan/debug_alloc_stack.cpp b/compiler-rt/test/tsan/debug_alloc_stack.cpp
index 8fbcf232a1ac7..f45caf3735244 100644
--- a/compiler-rt/test/tsan/debug_alloc_stack.cpp
+++ b/compiler-rt/test/tsan/debug_alloc_stack.cpp
@@ -6,6 +6,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #ifndef __APPLE__
 #include <sys/types.h>

>From 1f5807eb359fb172575e18bb063fd85e4dc836e9 Mon Sep 17 00:00:00 2001
From: Andrzej Warzyński <andrzej.warzynski at arm.com>
Date: Wed, 24 Jul 2024 14:40:19 +0100
Subject: [PATCH 006/259] [mlir][vector][nfc] Simplify `in_bounds` attr update
 (#100334)

Since the `in_bounds` attribute is mandatory, there's no need for logic
like this (`readOp.getInBounds()` is guaranteed to return a non-empty
ArrayRef):

```cpp
ArrayAttr inBoundsAttr = readOp.getInBounds()
    ? rewriter.getArrayAttr(readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop))
    : ArrayAttr();
```

Instead, we can do this:
```cpp
ArrayAttr inBoundsAttr = rewriter.getArrayAttr(
    readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop));
```

This is a small follow-up for #97049 - this change should've been
included there.
---
 .../Dialect/Vector/Transforms/VectorTransforms.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 2686277bba59d..6777e589795c8 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -1321,11 +1321,8 @@ class DropInnerMostUnitDimsTransferRead
         cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
             srcType.getShape().drop_back(dimsToDrop), srcType, offsets, sizes,
             strides));
-    ArrayAttr inBoundsAttr =
-        readOp.getInBounds()
-            ? rewriter.getArrayAttr(
-                  readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop))
-            : ArrayAttr();
+    ArrayAttr inBoundsAttr = rewriter.getArrayAttr(
+        readOp.getInBoundsAttr().getValue().drop_back(dimsToDrop));
     Value rankedReducedView = rewriter.create<memref::SubViewOp>(
         loc, resultMemrefType, readOp.getSource(), offsets, sizes, strides);
     auto permMap = getTransferMinorIdentityMap(
@@ -1415,11 +1412,8 @@ class DropInnerMostUnitDimsTransferWrite
         cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
             srcType.getShape().drop_back(dimsToDrop), srcType, offsets, sizes,
             strides));
-    ArrayAttr inBoundsAttr =
-        writeOp.getInBounds()
-            ? rewriter.getArrayAttr(
-                  writeOp.getInBoundsAttr().getValue().drop_back(dimsToDrop))
-            : ArrayAttr();
+    ArrayAttr inBoundsAttr = rewriter.getArrayAttr(
+        writeOp.getInBoundsAttr().getValue().drop_back(dimsToDrop));
 
     Value rankedReducedView = rewriter.create<memref::SubViewOp>(
         loc, resultMemrefType, writeOp.getSource(), offsets, sizes, strides);

>From 5b15d9c441810121c23f9f421bbb007fd4c448e8 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Wed, 24 Jul 2024 09:57:39 -0400
Subject: [PATCH 007/259] [clang][OpenMP] Propagate debug location to
 OMPIRBuilder reduction codegen (#100358)

This patch propagates the debug location from Clang to the
OpenMPIRBuilder.

Fixes https://github.com/llvm/llvm-project/issues/97458
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index f5bd4a141cc2d..8965a14d88a6f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1695,7 +1695,8 @@ void CGOpenMPRuntimeGPU::emitReduction(
                          CGF.AllocaInsertPt->getIterator());
   InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
-  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
+      CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
   llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
 
   CodeGenFunction::OMPPrivateScope Scope(CGF);

>From 557a7b8ae319527110ddaac93c896b2c0e6c824c Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Wed, 24 Jul 2024 15:58:18 +0200
Subject: [PATCH 008/259] [libc][math][c23] Fix totalorder and totalordermag
 smoke tests (#100354)

FPBits::signaling_nan() defaults to setting only the MSB of the payload
to 1. The tests also used signaling NaNs with an explicit payload of
0x123. With float16 the leading 1 of 0x123 lands on that MSB, so 0x123
is greater than the default payload, but with more precise
floating-point types the payload field is wider and 0x123 falls below
the default, flipping the expected totalorder results between types.
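
A minimal sketch of the payload arithmetic (this is not the libc FPBits
API; the 9-bit and 22-bit sNaN payload widths used here for float16 and
float are assumptions for illustration):

```cpp
#include <cstdint>
#include <cstdio>

// Default sNaN payload in this model: only the MSB of the payload field is set.
static uint64_t default_payload(int payload_bits) {
  return uint64_t{1} << (payload_bits - 1);
}

int main() {
  const int widths[] = {9, 22}; // assumed payload widths: float16 vs. float
  for (int bits : widths) {
    uint64_t def = default_payload(bits);
    std::printf("bits=%-2d default=0x%-8llx 0x123 %s default, 0x42 %s default\n",
                bits, static_cast<unsigned long long>(def),
                0x123 > def ? ">" : "<", 0x42 > def ? ">" : "<");
  }
  // bits=9 : default=0x100    -> 0x123 > default, 0x42 < default
  // bits=22: default=0x200000 -> 0x123 < default, 0x42 < default
  // 0x42 stays below the default payload for every width, so the expected
  // totalorder results no longer depend on the type.
  return 0;
}
```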
---
 libc/test/src/math/smoke/TotalOrderMagTest.h | 24 ++++++++++----------
 libc/test/src/math/smoke/TotalOrderTest.h    | 24 ++++++++++----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/libc/test/src/math/smoke/TotalOrderMagTest.h b/libc/test/src/math/smoke/TotalOrderMagTest.h
index 5fe2983a0e678..3d7f24fc74d10 100644
--- a/libc/test/src/math/smoke/TotalOrderMagTest.h
+++ b/libc/test/src/math/smoke/TotalOrderMagTest.h
@@ -104,24 +104,24 @@ class TotalOrderMagTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
   }
 
   void testNaNPayloads(TotalOrderMagFunc func) {
-    T qnan_123 = FPBits::quiet_nan(Sign::POS, 0x123).get_val();
-    T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val();
-    T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val();
-    T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val();
+    T qnan_0x42 = FPBits::quiet_nan(Sign::POS, 0x42).get_val();
+    T neg_qnan_0x42 = FPBits::quiet_nan(Sign::NEG, 0x42).get_val();
+    T snan_0x42 = FPBits::signaling_nan(Sign::POS, 0x42).get_val();
+    T neg_snan_0x42 = FPBits::signaling_nan(Sign::NEG, 0x42).get_val();
 
     EXPECT_TRUE(funcWrapper(func, aNaN, aNaN));
     EXPECT_TRUE(funcWrapper(func, sNaN, sNaN));
-    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123));
-    EXPECT_TRUE(funcWrapper(func, sNaN, snan_123));
-    EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN));
-    EXPECT_FALSE(funcWrapper(func, snan_123, sNaN));
+    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, sNaN, snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, qnan_0x42, aNaN));
+    EXPECT_TRUE(funcWrapper(func, snan_0x42, sNaN));
 
     EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN));
     EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN));
-    EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_qnan_123));
-    EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_snan_123));
-    EXPECT_FALSE(funcWrapper(func, neg_qnan_123, neg_aNaN));
-    EXPECT_FALSE(funcWrapper(func, neg_snan_123, neg_sNaN));
+    EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, neg_qnan_0x42, neg_aNaN));
+    EXPECT_TRUE(funcWrapper(func, neg_snan_0x42, neg_sNaN));
   }
 };
 
diff --git a/libc/test/src/math/smoke/TotalOrderTest.h b/libc/test/src/math/smoke/TotalOrderTest.h
index 281b2a59f930d..4d4257d089daf 100644
--- a/libc/test/src/math/smoke/TotalOrderTest.h
+++ b/libc/test/src/math/smoke/TotalOrderTest.h
@@ -102,24 +102,24 @@ class TotalOrderTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
   }
 
   void testNaNPayloads(TotalOrderFunc func) {
-    T qnan_123 = FPBits::quiet_nan(Sign::POS, 0x123).get_val();
-    T neg_qnan_123 = FPBits::quiet_nan(Sign::NEG, 0x123).get_val();
-    T snan_123 = FPBits::signaling_nan(Sign::POS, 0x123).get_val();
-    T neg_snan_123 = FPBits::signaling_nan(Sign::NEG, 0x123).get_val();
+    T qnan_0x42 = FPBits::quiet_nan(Sign::POS, 0x42).get_val();
+    T neg_qnan_0x42 = FPBits::quiet_nan(Sign::NEG, 0x42).get_val();
+    T snan_0x42 = FPBits::signaling_nan(Sign::POS, 0x42).get_val();
+    T neg_snan_0x42 = FPBits::signaling_nan(Sign::NEG, 0x42).get_val();
 
     EXPECT_TRUE(funcWrapper(func, aNaN, aNaN));
     EXPECT_TRUE(funcWrapper(func, sNaN, sNaN));
-    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_123));
-    EXPECT_TRUE(funcWrapper(func, sNaN, snan_123));
-    EXPECT_FALSE(funcWrapper(func, qnan_123, aNaN));
-    EXPECT_FALSE(funcWrapper(func, snan_123, sNaN));
+    EXPECT_TRUE(funcWrapper(func, aNaN, qnan_0x42));
+    EXPECT_FALSE(funcWrapper(func, sNaN, snan_0x42));
+    EXPECT_FALSE(funcWrapper(func, qnan_0x42, aNaN));
+    EXPECT_TRUE(funcWrapper(func, snan_0x42, sNaN));
 
     EXPECT_TRUE(funcWrapper(func, neg_aNaN, neg_aNaN));
     EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_sNaN));
-    EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_qnan_123));
-    EXPECT_FALSE(funcWrapper(func, neg_sNaN, neg_snan_123));
-    EXPECT_TRUE(funcWrapper(func, neg_qnan_123, neg_aNaN));
-    EXPECT_TRUE(funcWrapper(func, neg_snan_123, neg_sNaN));
+    EXPECT_FALSE(funcWrapper(func, neg_aNaN, neg_qnan_0x42));
+    EXPECT_TRUE(funcWrapper(func, neg_sNaN, neg_snan_0x42));
+    EXPECT_TRUE(funcWrapper(func, neg_qnan_0x42, neg_aNaN));
+    EXPECT_FALSE(funcWrapper(func, neg_snan_0x42, neg_sNaN));
   }
 };
 

>From 1feef92a775daf8818faf766e0b1332421b48c5f Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena <usx at google.com>
Date: Wed, 24 Jul 2024 15:58:52 +0200
Subject: [PATCH 009/259] Fix lifetimebound for field access (#100197)

Fixes: https://github.com/llvm/llvm-project/issues/81589

There is no way to switch this off other than with `-Wno-dangling`.
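
A hedged sketch of the newly diagnosed pattern (illustrative names, not taken
from this patch; the test added below exercises the same idea through
`[[clang::lifetimebound]]`):

```cpp
struct Owner {
  int field;
  Owner *self() [[clang::lifetimebound]] { return this; }
};

void use() {
  // The temporary Owner{} dies at the end of the full-expression, so r dangles;
  // with this fix Clang is expected to warn here (suppressible via -Wno-dangling).
  const int &r = Owner{}.self()->field;
  (void)r;
}
```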
---
 clang/docs/ReleaseNotes.rst               |  2 ++
 clang/lib/Sema/CheckExprLifetime.cpp      |  9 ++++++++
 clang/test/SemaCXX/attr-lifetimebound.cpp | 26 +++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f52514a7aa3a2..c80c2ebbac9a7 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -124,6 +124,8 @@ Improvements to Clang's diagnostics
       template <typename> int i; // error: non-static data member 'i' cannot be declared as a template
      };
 
+- Clang now diagnoses dangling references to fields of temporary objects. Fixes #GH81589.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index 5c8ef564f30aa..112cf3d081822 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CheckExprLifetime.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Sema/Initialization.h"
@@ -548,6 +549,14 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path,
                                        EnableLifetimeWarnings);
   }
 
+  if (auto *M = dyn_cast<MemberExpr>(Init)) {
+    // Lifetime of a non-reference type field is same as base object.
+    if (auto *F = dyn_cast<FieldDecl>(M->getMemberDecl());
+        F && !F->getType()->isReferenceType())
+      visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true,
+                                       EnableLifetimeWarnings);
+  }
+
   if (isa<CallExpr>(Init)) {
     if (EnableLifetimeWarnings)
       handleGslAnnotatedTypes(Path, Init, Visit);
diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp
index 70bc545c07bd9..7db0a4d64d259 100644
--- a/clang/test/SemaCXX/attr-lifetimebound.cpp
+++ b/clang/test/SemaCXX/attr-lifetimebound.cpp
@@ -47,6 +47,31 @@ namespace usage_ok {
     q = A(); // expected-warning {{object backing the pointer q will be destroyed at the end of the full-expression}}
     r = A(1); // expected-warning {{object backing the pointer r will be destroyed at the end of the full-expression}}
   }
+
+  struct FieldCheck {
+    struct Set {
+      int a;
+    };
+    struct Pair {
+      const int& a;
+      int b;
+      Set c;
+      int * d;
+    };
+    Pair p;  
+    FieldCheck(const int& a): p(a){}
+    Pair& getR() [[clang::lifetimebound]] { return p; }
+    Pair* getP() [[clang::lifetimebound]] { return &p; }
+    Pair* getNoLB() { return &p; }
+  };
+  void test_field_access() {
+    int x = 0;
+    const int& a = FieldCheck{x}.getR().a;
+    const int& b = FieldCheck{x}.getP()->b;   // expected-warning {{temporary bound to local reference 'b' will be destroyed at the end of the full-expression}}
+    const int& c = FieldCheck{x}.getP()->c.a; // expected-warning {{temporary bound to local reference 'c' will be destroyed at the end of the full-expression}}
+    const int& d = FieldCheck{x}.getNoLB()->c.a;
+    const int* e = FieldCheck{x}.getR().d;
+  }
 }
 
 # 1 "<std>" 1 3
@@ -239,3 +264,4 @@ namespace move_forward_et_al_examples {
   S X;
   S *AddressOfOk = std::addressof(X);
 } // namespace move_forward_et_al_examples
+

>From 83ea7ce3a132001dfd7665c7eb50ef44cfd79802 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 24 Jul 2024 07:15:10 -0700
Subject: [PATCH 010/259] [BOLT][NFC] Track fragment relationships using
 EquivalenceClasses

Three-way splitting can create references between split fragments (warm
to cold or vice versa) that are not handled by
`isChildOf/isParentOf/isChildOrParentOf`. Generalize fragment
relationships to allow checking whether two functions belong to the same
group, potentially in the presence of ICF, which can join multiple groups.
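
Below is a minimal sketch of the `EquivalenceClasses` approach (illustrative
`Func` type, not the BOLT `BinaryFunction` API): fragment/parent pairs are
unioned as they are discovered, and relatedness becomes a single equivalence
query afterwards.

```cpp
#include "llvm/ADT/EquivalenceClasses.h"
#include <cassert>

struct Func {};

int main() {
  Func Main, Warm, Cold;
  llvm::EquivalenceClasses<const Func *> Classes;
  Classes.unionSets(&Warm, &Main); // main.warm registered as a fragment of main
  Classes.unionSets(&Cold, &Main); // main.cold registered as a fragment of main
  // Warm and Cold are now related even though neither is the other's parent.
  assert(Classes.isEquivalent(&Warm, &Cold));
  return 0;
}
```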

Test Plan: NFC for existing tests

Reviewers: maksfb, ayermolo, rafaelauler, dcci

Reviewed By: rafaelauler

Pull Request: https://github.com/llvm/llvm-project/pull/99979
---
 bolt/include/bolt/Core/BinaryContext.h  | 15 ++++++++++++++-
 bolt/include/bolt/Core/BinaryFunction.h | 10 ----------
 bolt/lib/Core/BinaryContext.cpp         |  9 +++++----
 bolt/lib/Core/Exceptions.cpp            |  2 +-
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index b3cf9f834cc08..5fb32a1ea6784 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -23,6 +23,7 @@
 #include "bolt/RuntimeLibs/RuntimeLibrary.h"
 #include "llvm/ADT/AddressRanges.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -241,6 +242,10 @@ class BinaryContext {
   /// Function fragments to skip.
   std::unordered_set<BinaryFunction *> FragmentsToSkip;
 
+  /// Fragment equivalence classes to query belonging to the same "family" in
+  /// presence of multiple fragments/multiple parents.
+  EquivalenceClasses<const BinaryFunction *> FragmentClasses;
+
   /// The runtime library.
   std::unique_ptr<RuntimeLibrary> RtLibrary;
 
@@ -1032,7 +1037,15 @@ class BinaryContext {
   ///   fragment_name == parent_name.cold(.\d+)?
   /// True if the Function is registered, false if the check failed.
   bool registerFragment(BinaryFunction &TargetFunction,
-                        BinaryFunction &Function) const;
+                        BinaryFunction &Function);
+
+  /// Return true if two functions belong to the same "family": are fragments
+  /// of one another, or fragments of the same parent, or transitively fragment-
+  /// related.
+  bool areRelatedFragments(const BinaryFunction *LHS,
+                           const BinaryFunction *RHS) const {
+    return FragmentClasses.isEquivalent(LHS, RHS);
+  }
 
   /// Add interprocedural reference for \p Function to \p Address
   void addInterproceduralReference(BinaryFunction *Function, uint64_t Address) {
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index da3fc433b7a3b..24c7db2f5d69c 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -1793,11 +1793,6 @@ class BinaryFunction {
     return ParentFragments.contains(&Other);
   }
 
-  /// Returns if this function is a parent of \p Other function.
-  bool isParentOf(const BinaryFunction &Other) const {
-    return Fragments.contains(&Other);
-  }
-
   /// Return the child fragment form parent function
   iterator_range<FragmentsSetTy::const_iterator> getFragments() const {
     return iterator_range<FragmentsSetTy::const_iterator>(Fragments.begin(),
@@ -1807,11 +1802,6 @@ class BinaryFunction {
   /// Return the parent function for split function fragments.
   FragmentsSetTy *getParentFragments() { return &ParentFragments; }
 
-  /// Returns if this function is a parent or child of \p Other function.
-  bool isParentOrChildOf(const BinaryFunction &Other) const {
-    return isChildOf(Other) || isParentOf(Other);
-  }
-
   /// Set the profile data for the number of times the function was called.
   BinaryFunction &setExecutionCount(uint64_t Count) {
     ExecutionCount = Count;
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 6a1106f23e485..fd7bce106db13 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -646,7 +646,7 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
     const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value);
     const bool DoesBelongToFunction =
         BF.containsAddress(Value) ||
-        (TargetBF && TargetBF->isParentOrChildOf(BF));
+        (TargetBF && areRelatedFragments(TargetBF, &BF));
     if (!DoesBelongToFunction) {
       LLVM_DEBUG({
         if (!BF.containsAddress(Value)) {
@@ -841,7 +841,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
     // Prevent associating a jump table to a specific fragment twice.
     // This simple check arises from the assumption: no more than 2 fragments.
     if (JT->Parents.size() == 1 && JT->Parents[0] != &Function) {
-      assert(JT->Parents[0]->isParentOrChildOf(Function) &&
+      assert(areRelatedFragments(JT->Parents[0], &Function) &&
              "cannot re-use jump table of a different function");
       // Duplicate the entry for the parent function for easy access
       JT->Parents.push_back(&Function);
@@ -1209,12 +1209,13 @@ void BinaryContext::generateSymbolHashes() {
 }
 
 bool BinaryContext::registerFragment(BinaryFunction &TargetFunction,
-                                     BinaryFunction &Function) const {
+                                     BinaryFunction &Function) {
   assert(TargetFunction.isFragment() && "TargetFunction must be a fragment");
   if (TargetFunction.isChildOf(Function))
     return true;
   TargetFunction.addParentFragment(Function);
   Function.addFragment(TargetFunction);
+  FragmentClasses.unionSets(&TargetFunction, &Function);
   if (!HasRelocations) {
     TargetFunction.setSimple(false);
     Function.setSimple(false);
@@ -1336,7 +1337,7 @@ void BinaryContext::processInterproceduralReferences() {
 
     if (TargetFunction) {
       if (TargetFunction->isFragment() &&
-          !TargetFunction->isChildOf(Function)) {
+          !areRelatedFragments(TargetFunction, &Function)) {
         this->errs()
             << "BOLT-WARNING: interprocedural reference between unrelated "
                "fragments: "
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 82bddf76d5b8c..6a46a49a983d8 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -207,7 +207,7 @@ Error BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
                "BOLT-ERROR: cannot find landing pad fragment");
         BC.addInterproceduralReference(this, Fragment->getAddress());
         BC.processInterproceduralReferences();
-        assert(isParentOrChildOf(*Fragment) &&
+        assert(BC.areRelatedFragments(this, Fragment) &&
                "BOLT-ERROR: cannot have landing pads in different functions");
         setHasIndirectTargetToSplitFragment(true);
         BC.addFragmentsToSkip(this);

>From 9d2dd009b60793f562ba1b00e3d02773f060e4b6 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 24 Jul 2024 07:16:39 -0700
Subject: [PATCH 011/259] [BOLT] Support more than two jump table parents

Multi-way splitting can cause multiple fragments to access the same jump
table. Relax the assumption that a jump table can only have up to two
parents.
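
A hedged sketch of the relaxed invariant (illustrative names, not the BOLT
API): a function can be recorded as an additional jump-table parent as long as
it is fragment-related to every parent already recorded, with no cap on the
parent count.

```cpp
#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <vector>

struct Func {};

// Stand-in for BinaryContext::areRelatedFragments.
static bool areRelated(const Func *, const Func *) { return true; }

static void addJumpTableParent(std::vector<Func *> &Parents, Func *F) {
  if (llvm::is_contained(Parents, F))
    return; // this fragment is already a parent, nothing to do
  assert(llvm::all_of(Parents,
                      [&](const Func *P) { return areRelated(P, F); }) &&
         "cannot re-use jump table of a different function");
  Parents.push_back(F);
}

int main() {
  Func Main, Warm;
  std::vector<Func *> Parents;
  addJumpTableParent(Parents, &Main);
  addJumpTableParent(Parents, &Warm); // ok: related to main
  addJumpTableParent(Parents, &Warm); // no-op: already a parent
  assert(Parents.size() == 2);
  return 0;
}
```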

Test Plan: added bolt/test/X86/three-way-split-jt.s

Reviewers: ayermolo, dcci, rafaelauler, maksfb

Reviewed By: rafaelauler, dcci

Pull Request: https://github.com/llvm/llvm-project/pull/99988
---
 bolt/lib/Core/BinaryContext.cpp    | 12 ++--
 bolt/test/X86/three-way-split-jt.s | 95 ++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 5 deletions(-)
 create mode 100644 bolt/test/X86/three-way-split-jt.s

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index fd7bce106db13..e86075e69c05d 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -839,9 +839,11 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
     assert(Address == JT->getAddress() && "unexpected non-empty jump table");
 
     // Prevent associating a jump table to a specific fragment twice.
-    // This simple check arises from the assumption: no more than 2 fragments.
-    if (JT->Parents.size() == 1 && JT->Parents[0] != &Function) {
-      assert(areRelatedFragments(JT->Parents[0], &Function) &&
+    if (!llvm::is_contained(JT->Parents, &Function)) {
+      assert(llvm::all_of(JT->Parents,
+                          [&](const BinaryFunction *BF) {
+                            return areRelatedFragments(&Function, BF);
+                          }) &&
              "cannot re-use jump table of a different function");
       // Duplicate the entry for the parent function for easy access
       JT->Parents.push_back(&Function);
@@ -852,8 +854,8 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
         JT->print(this->outs());
       }
       Function.JumpTables.emplace(Address, JT);
-      JT->Parents[0]->setHasIndirectTargetToSplitFragment(true);
-      JT->Parents[1]->setHasIndirectTargetToSplitFragment(true);
+      for (BinaryFunction *Parent : JT->Parents)
+        Parent->setHasIndirectTargetToSplitFragment(true);
     }
 
     bool IsJumpTableParent = false;
diff --git a/bolt/test/X86/three-way-split-jt.s b/bolt/test/X86/three-way-split-jt.s
new file mode 100644
index 0000000000000..c4ddc1b9905d7
--- /dev/null
+++ b/bolt/test/X86/three-way-split-jt.s
@@ -0,0 +1,95 @@
+## This reproduces an issue where the function is split into three fragments
+## and all fragments access the same jump table.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -v=1 -print-only=main.warm -print-cfg  2>&1 | FileCheck %s
+
+# CHECK-DAG: BOLT-INFO: marking main.warm as a fragment of main
+# CHECK-DAG: BOLT-INFO: marking main.cold as a fragment of main
+# CHECK-DAG: BOLT-INFO: processing main.warm as a sibling of non-ignored function
+# CHECK-DAG: BOLT-INFO: processing main.cold as a sibling of non-ignored function
+# CHECK-DAG: BOLT-WARNING: Ignoring main.cold
+# CHECK-DAG: BOLT-WARNING: Ignoring main.warm
+# CHECK-DAG: BOLT-WARNING: Ignoring main
+# CHECK: BOLT-WARNING: skipped 3 functions due to cold fragments
+
+# CHECK: PIC Jump table JUMP_TABLE for function main, main.warm, main.cold
+# CHECK-NEXT:   0x0000 : __ENTRY_main at 0x[[#]]
+# CHECK-NEXT:   0x0004 : __ENTRY_main at 0x[[#]]
+# CHECK-NEXT:   0x0008 : __ENTRY_main.cold at 0x[[#]]
+# CHECK-NEXT:   0x000c : __ENTRY_main at 0x[[#]]
+  .globl main
+  .type main, %function
+  .p2align 2
+main:
+LBB0:
+  andl $0xf, %ecx
+  cmpb $0x4, %cl
+  ## exit through ret
+  ja LBB3
+
+## jump table dispatch, jumping to label indexed by val in %ecx
+LBB1:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+
+LBB2:
+  xorq %rax, %rax
+LBB3:
+  addq $0x8, %rsp
+  ret
+.size main, .-main
+
+  .globl main.warm
+  .type main.warm, %function
+  .p2align 2
+main.warm:
+LBB20:
+  andl $0xb, %ebx
+  cmpb $0x1, %cl
+  # exit through ret
+  ja LBB23
+
+## jump table dispatch, jumping to label indexed by val in %ecx
+LBB21:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+
+LBB22:
+  xorq %rax, %rax
+LBB23:
+  addq $0x8, %rsp
+  ret
+.size main.warm, .-main.warm
+
+## cold fragment is only reachable through jump table
+  .globl main.cold
+  .type main.cold, %function
+main.cold:
+  leaq JUMP_TABLE(%rip), %r8
+  movzbl %cl, %ecx
+  movslq (%r8,%rcx,4), %rax
+  addq %rax, %r8
+  jmpq *%r8
+LBB4:
+  callq abort
+.size main.cold, .-main.cold
+
+  .rodata
+## jmp table, entries must be R_X86_64_PC32 relocs
+  .globl JUMP_TABLE
+JUMP_TABLE:
+  .long LBB2-JUMP_TABLE
+  .long LBB3-JUMP_TABLE
+  .long LBB4-JUMP_TABLE
+  .long LBB3-JUMP_TABLE

>From c4b66bf4d065d3bbc2e2fac8512a6df8e013c704 Mon Sep 17 00:00:00 2001
From: Carlos Seo <carlos.seo at linaro.org>
Date: Wed, 24 Jul 2024 11:18:08 -0300
Subject: [PATCH 012/259] [AArch64] Implement INIT/ADJUST_TRAMPOLINE (#70267)

Add support for llvm.init.trampoline and llvm.adjust.trampoline
intrinsics for AArch64.

Fixes https://github.com/llvm/llvm-project/issues/65573
Fixes https://github.com/llvm/llvm-project/issues/76927
Fixes https://github.com/llvm/llvm-project/issues/83555
Updates https://github.com/llvm/llvm-project/pull/66157
---
 compiler-rt/lib/builtins/README.txt           |  5 ++
 compiler-rt/lib/builtins/trampoline_setup.c   | 42 ++++++++++++++
 .../builtins/Unit/trampoline_setup_test.c     |  2 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 58 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 llvm/test/CodeGen/AArch64/trampoline.ll       | 19 ++++++
 6 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/trampoline.ll

diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index 2d213d95f333a..19f26c92a0f94 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -272,6 +272,11 @@ switch32
 switch8
 switchu8
 
+// This function generates a custom trampoline function with the specific
+// realFunc and localsPtr values.
+void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
+                        const void* realFunc, void* localsPtr);
+
 // There is no C interface to the *_vfp_d8_d15_regs functions.  There are
 // called in the prolog and epilog of Thumb1 functions.  When the C++ ABI use
 // SJLJ for exceptions, each function with a catch clause or destructors needs
diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c
index 844eb27944142..830e25e4c0303 100644
--- a/compiler-rt/lib/builtins/trampoline_setup.c
+++ b/compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,3 +41,45 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
+
+// The AArch64 compiler generates calls to __trampoline_setup() when creating
+// trampoline functions on the stack for use with nested functions.
+// This function creates a custom 36-byte trampoline function on the stack
+// which loads x18 with a pointer to the outer function's locals
+// and then jumps to the target nested function.
+// Note: x18 is a reserved platform register on Windows and macOS.
+
+#if defined(__aarch64__) && defined(__ELF__)
+COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
+                                        int trampSizeAllocated,
+                                        const void *realFunc, void *localsPtr) {
+  // This should never happen, but if compiler did not allocate
+  // enough space on stack for the trampoline, abort.
+  if (trampSizeAllocated < 36)
+    compilerrt_abort();
+
+  // create trampoline
+  // Load realFunc into x17. mov/movk 16 bits at a time.
+  trampOnStack[0] =
+      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
+  trampOnStack[1] =
+      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
+  trampOnStack[2] =
+      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
+  trampOnStack[3] =
+      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
+  // Load localsPtr into x18
+  trampOnStack[4] =
+      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
+  trampOnStack[5] =
+      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
+  trampOnStack[6] =
+      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
+  trampOnStack[7] =
+      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
+  trampOnStack[8] = 0xd61f0220; // br x17
+
+  // Clear instruction cache.
+  __clear_cache(trampOnStack, &trampOnStack[9]);
+}
+#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)
diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
index da115fe764271..d51d35acaa02f 100644
--- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
+++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@
 
 /*
  * Tests nested functions
- * The ppc compiler generates a call to __trampoline_setup
+ * The ppc and aarch64 compilers generates a call to __trampoline_setup
  * The i386 and x86_64 compilers generate a call to ___enable_execute_stack
  */
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7efe99c63a374..d86e52d49000a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1080,6 +1080,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   // Try to create BICs for vector ANDs.
   setTargetDAGCombine(ISD::AND);
 
+  // llvm.init.trampoline and llvm.adjust.trampoline
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
   // Vector add and sub nodes may conceal a high-half opportunity.
   // Also, try to fold ADD into CSINC/CSINV..
   setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
@@ -6688,6 +6692,56 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
   return Final;
 }
 
+SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
+  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+    report_fatal_error(
+        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
+
+  return Op.getOperand(0);
+}
+
+SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+
+  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
+  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+    report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl(Op);
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp;
+  Args.push_back(Entry);
+  Entry.Node = DAG.getConstant(20, dl, MVT::i64);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr;
+  Args.push_back(Entry);
+  Entry.Node = Nest;
+  Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -6705,6 +6759,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerGlobalTLSAddress(Op, DAG);
   case ISD::PtrAuthGlobalAddress:
     return LowerPtrAuthGlobalAddress(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:
+    return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:
+    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::SETCC:
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ef45e4f01ecd3..81e15185f985d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1143,6 +1143,8 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
+  SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRIND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
new file mode 100644
index 0000000000000..293e538a7459d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+
+declare void @llvm.init.trampoline(ptr, ptr, ptr);
+declare ptr @llvm.adjust.trampoline(ptr);
+
+define i64 @f(ptr nest %c, i64 %x, i64 %y) {
+  %sum = add i64 %x, %y
+  ret i64 %sum
+}
+
+define i64 @main() {
+  %val = alloca i64
+  %nval = bitcast ptr %val to ptr
+  %tramp = alloca [36 x i8], align 8
+  ; CHECK:	bl	__trampoline_setup
+  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval)
+  %fp = call ptr @llvm.adjust.trampoline(ptr %tramp)
+  ret i64 0
+}

>From 6c23f974d6fe43d8e5c0adbc4a4f89fbabc9d9ea Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Wed, 24 Jul 2024 16:26:15 +0200
Subject: [PATCH 013/259] [libc] Enable bind test for riscv (#100132)

The bind test was failing on the rv32 build bot because of how the bot was set up to run the tests: we were using directories shared between the host and qemu, so the bind function ended up creating its file on the host machine.

The OS was returning "-1 ENXIO (No such device or address)", so we changed the rv32 buildbot to copy the binaries into qemu and dropped the shared-directories setup.
---
 libc/config/linux/riscv/entrypoints.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 6ab90771802fd..597aeb361afe4 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -856,6 +856,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.sys.select.select
 
     # sys/socket.h entrypoints
+    libc.src.sys.socket.bind
     libc.src.sys.socket.socket
   )
 endif()

>From 4d19676de417fa4dbfac07fc4fe0c30c232b3e68 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 24 Jul 2024 07:31:01 -0700
Subject: [PATCH 014/259] [BOLT] Add profile-use-pseudo-probes option

Move pseudo probe profile generation under the --profile-use-pseudo-probes
option. Note that updating pseudo probes is independent of this flag.

Test Plan: updated pseudoprobe-decoding-inline.test

Reviewers: maksfb, rafaelauler, ayermolo, dcci, WenleiHe

Reviewed By: WenleiHe

Pull Request: https://github.com/llvm/llvm-project/pull/100299
---
 bolt/lib/Profile/DataAggregator.cpp            |  4 +++-
 bolt/lib/Profile/YAMLProfileReader.cpp         |  5 +++++
 bolt/lib/Profile/YAMLProfileWriter.cpp         |  4 +++-
 bolt/lib/Rewrite/PseudoProbeRewriter.cpp       |  6 +++++-
 bolt/test/X86/pseudoprobe-decoding-inline.test | 12 ++++++++++--
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index c9668bddd0d7e..a300e5b2b1dab 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -88,6 +88,7 @@ MaxSamples("max-samples",
   cl::cat(AggregatorCategory));
 
 extern cl::opt<opts::ProfileFormatKind> ProfileFormat;
+extern cl::opt<bool> ProfileUsePseudoProbes;
 extern cl::opt<std::string> SaveProfile;
 
 cl::opt<bool> ReadPreAggregated(
@@ -2298,7 +2299,8 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
 
   yaml::bolt::BinaryProfile BP;
 
-  const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
+  const MCPseudoProbeDecoder *PseudoProbeDecoder =
+      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
 
   // Fill out the header info.
   BP.Header.Version = 1;
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 604a9fb4813be..3eca5e972fa5b 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -49,6 +49,11 @@ llvm::cl::opt<bool>
 llvm::cl::opt<bool> ProfileUseDFS("profile-use-dfs",
                                   cl::desc("use DFS order for YAML profile"),
                                   cl::Hidden, cl::cat(BoltOptCategory));
+
+llvm::cl::opt<bool> ProfileUsePseudoProbes(
+    "profile-use-pseudo-probes",
+    cl::desc("Use pseudo probes for profile generation and matching"),
+    cl::Hidden, cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace llvm {
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index b9fcb5df29fdb..84777741d611a 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -22,6 +22,7 @@
 
 namespace opts {
 extern llvm::cl::opt<bool> ProfileUseDFS;
+extern llvm::cl::opt<bool> ProfileUsePseudoProbes;
 } // namespace opts
 
 namespace llvm {
@@ -57,7 +58,8 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
                            const BoltAddressTranslation *BAT) {
   yaml::bolt::BinaryFunctionProfile YamlBF;
   const BinaryContext &BC = BF.getBinaryContext();
-  const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder();
+  const MCPseudoProbeDecoder *PseudoProbeDecoder =
+      opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr;
 
   const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
 
diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
index 3704a9ba452b9..886bbdbf9d686 100644
--- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
+++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp
@@ -49,6 +49,7 @@ static cl::opt<PrintPseudoProbesOptions> PrintPseudoProbes(
                clEnumValN(PPP_All, "all", "enable all debugging printout")),
     cl::Hidden, cl::cat(BoltCategory));
 
+extern cl::opt<bool> ProfileUsePseudoProbes;
 } // namespace opts
 
 namespace {
@@ -89,12 +90,15 @@ class PseudoProbeRewriter final : public MetadataRewriter {
 };
 
 Error PseudoProbeRewriter::preCFGInitializer() {
-  parsePseudoProbe();
+  if (opts::ProfileUsePseudoProbes)
+    parsePseudoProbe();
 
   return Error::success();
 }
 
 Error PseudoProbeRewriter::postEmitFinalizer() {
+  if (!opts::ProfileUsePseudoProbes)
+    parsePseudoProbe();
   updatePseudoProbes();
 
   return Error::success();
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index 924331e34c76a..b361551e5711e 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -6,11 +6,11 @@
 # PREAGG: B X:0 #main# 1 0
 ## Check pseudo-probes in regular YAML profile (non-BOLTed binary)
 # RUN: link_fdata %s %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin %t.preagg PREAGG
-# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata
+# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-use-pseudo-probes
 # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-YAML
 ## Check pseudo-probes in BAT YAML profile (BOLTed binary)
 # RUN: link_fdata %s %t.bolt %t.preagg2 PREAGG
-# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2
+# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-use-pseudo-probes
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
@@ -29,6 +29,14 @@
 # CHECK-YAML:   pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ]
 # CHECK-YAML: guid: 0xDB956436E78DD5FA
 # CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF
+#
+## Check that without --profile-use-pseudo-probes option, no pseudo probes are
+## generated
+# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata
+# RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT
+# CHECK-NO-OPT-NOT: pseudo_probes
+# CHECK-NO-OPT-NOT: guid
+# CHECK-NO-OPT-NOT: pseudo_probe_desc_hash
 
 CHECK: Report of decoding input pseudo probe binaries
 

>From f2b6f0b5868d052ef03431405b1053be18a36d75 Mon Sep 17 00:00:00 2001
From: Mitch Phillips <mitchp at google.com>
Date: Wed, 24 Jul 2024 16:38:42 +0200
Subject: [PATCH 015/259] Revert "[tsan] Enable test on linux (#99659)"

This reverts commit 52ebd8d0577e07be9c25b56656d1974a84c40bf7.

The declaration of gettid() is missing on the buildbot
(https://lab.llvm.org/buildbot/#/builders/72/builds/1592), and unfortunately
a fix-forward attempt that included unistd.h didn't work
(de8c4bef3329ce0045b8167cdb96d53b2a1509df). Reverting for now.
---
 compiler-rt/test/tsan/debug_alloc_stack.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/test/tsan/debug_alloc_stack.cpp b/compiler-rt/test/tsan/debug_alloc_stack.cpp
index f45caf3735244..dfb2882fb6ea2 100644
--- a/compiler-rt/test/tsan/debug_alloc_stack.cpp
+++ b/compiler-rt/test/tsan/debug_alloc_stack.cpp
@@ -1,6 +1,9 @@
 // RUN: %clangxx_tsan -O0 %s -o %t
 // RUN: env %env_tsan_opts=stack_trace_format=DEFAULT %deflake %run %t 2>&1 | FileCheck %s
 
+// Until I figure out how to make this test work on Linux
+// REQUIRES: system-darwin
+
 #include "test.h"
 #include <pthread.h>
 #include <stdint.h>

>From 374bbc93bcb34268f175405a71806c4580495c6d Mon Sep 17 00:00:00 2001
From: Mitch Phillips <mitchp at google.com>
Date: Wed, 24 Jul 2024 16:39:25 +0200
Subject: [PATCH 016/259] Revert "[tsan] Include unistd.h to declare gettid()"

This reverts commit de8c4bef3329ce0045b8167cdb96d53b2a1509df.

Unfortunately the fix-forward didn't work. Reverting both and will have
Vitaly re-submit.
---
 compiler-rt/test/tsan/debug_alloc_stack.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler-rt/test/tsan/debug_alloc_stack.cpp b/compiler-rt/test/tsan/debug_alloc_stack.cpp
index dfb2882fb6ea2..70e9759f0af63 100644
--- a/compiler-rt/test/tsan/debug_alloc_stack.cpp
+++ b/compiler-rt/test/tsan/debug_alloc_stack.cpp
@@ -9,7 +9,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 
 #ifndef __APPLE__
 #include <sys/types.h>

>From f83950ab8dfda1da882a6ef7b508639df251621a Mon Sep 17 00:00:00 2001
From: Angel Zhang <angel.zhang at amd.com>
Date: Wed, 24 Jul 2024 10:41:19 -0400
Subject: [PATCH 017/259] [mlir][spirv] Implement vector unrolling for
 `convert-to-spirv` pass (#100138)

### Description
This PR builds on #99872. It implements a minimal version of function
body vector unrolling that converts vector types into 1-D vectors with a
size supported by SPIR-V (2, 3, or 4, depending on the original
dimension). The
ops that are currently supported include those with elementwise traits
(e.g. `arith.addi`), `vector.reduction` and `vector.transpose`. This PR
also includes new LIT tests that only check for vector unrolling.
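
For reference, a small standalone sketch of the size-selection rule
(mirroring the `getComputeVectorSize` helper added in this PR): take the
largest factor of the innermost dimension among {4, 3, 2}, falling back to 1.

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Largest SPIR-V-friendly factor of the innermost vector dimension.
static int computeVectorSize(int64_t size) {
  for (int i : {4, 3, 2})
    if (size % i == 0)
      return i;
  return 1;
}

int main() {
  assert(computeVectorSize(8) == 4); // vector<8xi32> unrolls into 4-element pieces
  assert(computeVectorSize(6) == 3); // vector<6xi32> unrolls into 3-element pieces
  assert(computeVectorSize(5) == 1); // no factor in {4,3,2}: element-wise fallback
  return 0;
}
```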

### Future Plans
- Support more ops

---------

Co-authored-by: Jakub Kuderski <kubakuderski at gmail.com>
---
 mlir/include/mlir/Conversion/Passes.td        |   5 +-
 .../SPIRV/Transforms/SPIRVConversion.h        |  21 +++
 .../ConvertToSPIRV/ConvertToSPIRVPass.cpp     |  21 ++-
 .../SPIRV/Transforms/SPIRVConversion.cpp      | 139 ++++++++++++++++--
 .../test/Conversion/ConvertToSPIRV/arith.mlir |   2 +-
 .../Conversion/ConvertToSPIRV/combined.mlir   |   2 +-
 .../func-signature-vector-unroll.mlir         |  44 ++++++
 .../test/Conversion/ConvertToSPIRV/index.mlir |   2 +-
 mlir/test/Conversion/ConvertToSPIRV/scf.mlir  |   2 +-
 .../Conversion/ConvertToSPIRV/simple.mlir     |   2 +-
 mlir/test/Conversion/ConvertToSPIRV/ub.mlir   |   2 +-
 .../ConvertToSPIRV/vector-unroll.mlir         | 102 +++++++++++++
 .../Conversion/ConvertToSPIRV/vector.mlir     |   2 +-
 .../Conversion/ConvertToSPIRV/CMakeLists.txt  |   2 +
 .../TestSPIRVFuncSignatureConversion.cpp      |   9 +-
 .../TestSPIRVVectorUnrolling.cpp              |  50 +++++++
 mlir/tools/mlir-opt/mlir-opt.cpp              |   2 +
 17 files changed, 371 insertions(+), 38 deletions(-)
 create mode 100644 mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir
 create mode 100644 mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 748646e605827..b5bb2f42f2961 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -47,7 +47,10 @@ def ConvertToSPIRVPass : Pass<"convert-to-spirv"> {
   let options = [
     Option<"runSignatureConversion", "run-signature-conversion", "bool",
     /*default=*/"true",
-    "Run function signature conversion to convert vector types">
+    "Run function signature conversion to convert vector types">,
+    Option<"runVectorUnrolling", "run-vector-unrolling", "bool",
+    /*default=*/"true",
+    "Run vector unrolling to convert vector types in function bodies">
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
index 9ad3d5fc85dd3..f54c93a09e727 100644
--- a/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
+++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h
@@ -19,8 +19,10 @@
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/LogicalResult.h"
 
 namespace mlir {
 
@@ -189,6 +191,25 @@ Value getVulkanElementPtr(const SPIRVTypeConverter &typeConverter,
                           MemRefType baseType, Value basePtr,
                           ValueRange indices, Location loc, OpBuilder &builder);
 
+// Find the largest factor of size among {2,3,4} for the lowest dimension of
+// the target shape.
+int getComputeVectorSize(int64_t size);
+
+// GetNativeVectorShape implementation for reduction ops.
+SmallVector<int64_t> getNativeVectorShapeImpl(vector::ReductionOp op);
+
+// GetNativeVectorShape implementation for transpose ops.
+SmallVector<int64_t> getNativeVectorShapeImpl(vector::TransposeOp op);
+
+// For general ops.
+std::optional<SmallVector<int64_t>> getNativeVectorShape(Operation *op);
+
+// Unroll vectors in function signatures to native size.
+LogicalResult unrollVectorsInSignatures(Operation *op);
+
+// Unroll vectors in function bodies to native size.
+LogicalResult unrollVectorsInFuncBodies(Operation *op);
+
 } // namespace spirv
 } // namespace mlir
 
diff --git a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
index 003a5feea9e9b..4694a147e1e94 100644
--- a/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/ConvertToSPIRV/ConvertToSPIRVPass.cpp
@@ -17,6 +17,8 @@
 #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
@@ -42,19 +44,16 @@ struct ConvertToSPIRVPass final
   using ConvertToSPIRVPassBase::ConvertToSPIRVPassBase;
 
   void runOnOperation() override {
-    MLIRContext *context = &getContext();
     Operation *op = getOperation();
+    MLIRContext *context = &getContext();
 
-    if (runSignatureConversion) {
-      // Unroll vectors in function signatures to native vector size.
-      RewritePatternSet patterns(context);
-      populateFuncOpVectorRewritePatterns(patterns);
-      populateReturnOpVectorRewritePatterns(patterns);
-      GreedyRewriteConfig config;
-      config.strictMode = GreedyRewriteStrictness::ExistingOps;
-      if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
-        return signalPassFailure();
-    }
+    // Unroll vectors in function signatures to native size.
+    if (runSignatureConversion && failed(spirv::unrollVectorsInSignatures(op)))
+      return signalPassFailure();
+
+    // Unroll vectors in function bodies to native size.
+    if (runVectorUnrolling && failed(spirv::unrollVectorsInFuncBodies(op)))
+      return signalPassFailure();
 
     spirv::TargetEnvAttr targetAttr = spirv::lookupTargetEnvOrDefault(op);
     std::unique_ptr<ConversionTarget> target =
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index bf5044437fd09..d833ec9309baa 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -20,17 +20,21 @@
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
 
 #include <functional>
@@ -46,14 +50,6 @@ namespace {
 // Utility functions
 //===----------------------------------------------------------------------===//
 
-static int getComputeVectorSize(int64_t size) {
-  for (int i : {4, 3, 2}) {
-    if (size % i == 0)
-      return i;
-  }
-  return 1;
-}
-
 static std::optional<SmallVector<int64_t>> getTargetShape(VectorType vecType) {
   LLVM_DEBUG(llvm::dbgs() << "Get target shape\n");
   if (vecType.isScalable()) {
@@ -62,8 +58,8 @@ static std::optional<SmallVector<int64_t>> getTargetShape(VectorType vecType) {
     return std::nullopt;
   }
   SmallVector<int64_t> unrollShape = llvm::to_vector<4>(vecType.getShape());
-  std::optional<SmallVector<int64_t>> targetShape =
-      SmallVector<int64_t>(1, getComputeVectorSize(vecType.getShape().back()));
+  std::optional<SmallVector<int64_t>> targetShape = SmallVector<int64_t>(
+      1, mlir::spirv::getComputeVectorSize(vecType.getShape().back()));
   if (!targetShape) {
     LLVM_DEBUG(llvm::dbgs() << "--no unrolling target shape defined\n");
     return std::nullopt;
@@ -1098,13 +1094,20 @@ struct ReturnOpVectorUnroll final : OpRewritePattern<func::ReturnOp> {
       // the original operand of illegal type.
       auto originalShape =
           llvm::to_vector_of<int64_t, 4>(origVecType.getShape());
-      SmallVector<int64_t> strides(targetShape->size(), 1);
+      SmallVector<int64_t> strides(originalShape.size(), 1);
+      SmallVector<int64_t> extractShape(originalShape.size(), 1);
+      extractShape.back() = targetShape->back();
       SmallVector<Type> newTypes;
       Value returnValue = returnOp.getOperand(origResultNo);
       for (SmallVector<int64_t> offsets :
            StaticTileOffsetRange(originalShape, *targetShape)) {
         Value result = rewriter.create<vector::ExtractStridedSliceOp>(
-            loc, returnValue, offsets, *targetShape, strides);
+            loc, returnValue, offsets, extractShape, strides);
+        if (originalShape.size() > 1) {
+          SmallVector<int64_t> extractIndices(originalShape.size() - 1, 0);
+          result =
+              rewriter.create<vector::ExtractOp>(loc, result, extractIndices);
+        }
         newOperands.push_back(result);
         newTypes.push_back(unrolledType);
       }
@@ -1285,6 +1288,118 @@ Value mlir::spirv::getElementPtr(const SPIRVTypeConverter &typeConverter,
                              builder);
 }
 
+//===----------------------------------------------------------------------===//
+// Public functions for vector unrolling
+//===----------------------------------------------------------------------===//
+
+int mlir::spirv::getComputeVectorSize(int64_t size) {
+  for (int i : {4, 3, 2}) {
+    if (size % i == 0)
+      return i;
+  }
+  return 1;
+}
+
+SmallVector<int64_t>
+mlir::spirv::getNativeVectorShapeImpl(vector::ReductionOp op) {
+  VectorType srcVectorType = op.getSourceVectorType();
+  assert(srcVectorType.getRank() == 1); // Guaranteed by semantics
+  int64_t vectorSize =
+      mlir::spirv::getComputeVectorSize(srcVectorType.getDimSize(0));
+  return {vectorSize};
+}
+
+SmallVector<int64_t>
+mlir::spirv::getNativeVectorShapeImpl(vector::TransposeOp op) {
+  VectorType vectorType = op.getResultVectorType();
+  SmallVector<int64_t> nativeSize(vectorType.getRank(), 1);
+  nativeSize.back() =
+      mlir::spirv::getComputeVectorSize(vectorType.getShape().back());
+  return nativeSize;
+}
+
+std::optional<SmallVector<int64_t>>
+mlir::spirv::getNativeVectorShape(Operation *op) {
+  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
+    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
+      SmallVector<int64_t> nativeSize(vecType.getRank(), 1);
+      nativeSize.back() =
+          mlir::spirv::getComputeVectorSize(vecType.getShape().back());
+      return nativeSize;
+    }
+  }
+
+  return TypeSwitch<Operation *, std::optional<SmallVector<int64_t>>>(op)
+      .Case<vector::ReductionOp, vector::TransposeOp>(
+          [](auto typedOp) { return getNativeVectorShapeImpl(typedOp); })
+      .Default([](Operation *) { return std::nullopt; });
+}
+
+LogicalResult mlir::spirv::unrollVectorsInSignatures(Operation *op) {
+  MLIRContext *context = op->getContext();
+  RewritePatternSet patterns(context);
+  populateFuncOpVectorRewritePatterns(patterns);
+  populateReturnOpVectorRewritePatterns(patterns);
+  // We only want to apply signature conversion once to the existing func ops.
+  // Without specifying strictMode, the greedy pattern rewriter will keep
+  // looking for newly created func ops.
+  GreedyRewriteConfig config;
+  config.strictMode = GreedyRewriteStrictness::ExistingOps;
+  return applyPatternsAndFoldGreedily(op, std::move(patterns), config);
+}
+
+LogicalResult mlir::spirv::unrollVectorsInFuncBodies(Operation *op) {
+  MLIRContext *context = op->getContext();
+
+  // Unroll vectors in function bodies to native vector size.
+  {
+    RewritePatternSet patterns(context);
+    auto options = vector::UnrollVectorOptions().setNativeShapeFn(
+        [](auto op) { return mlir::spirv::getNativeVectorShape(op); });
+    populateVectorUnrollPatterns(patterns, options);
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+
+  // Convert transpose ops into extract and insert pairs, in preparation of
+  // further transformations to canonicalize/cancel.
+  {
+    RewritePatternSet patterns(context);
+    auto options = vector::VectorTransformsOptions().setVectorTransposeLowering(
+        vector::VectorTransposeLowering::EltWise);
+    vector::populateVectorTransposeLoweringPatterns(patterns, options);
+    vector::populateVectorShapeCastLoweringPatterns(patterns);
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+
+  // Run canonicalization to cast away leading size-1 dimensions.
+  {
+    RewritePatternSet patterns(context);
+
+    // We need to pull in casting way leading one dims.
+    vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
+    vector::ReductionOp::getCanonicalizationPatterns(patterns, context);
+    vector::TransposeOp::getCanonicalizationPatterns(patterns, context);
+
+    // Decompose different rank insert_strided_slice and n-D
+    // extract_slided_slice.
+    vector::populateVectorInsertExtractStridedSliceDecompositionPatterns(
+        patterns);
+    vector::InsertOp::getCanonicalizationPatterns(patterns, context);
+    vector::ExtractOp::getCanonicalizationPatterns(patterns, context);
+
+    // Trimming leading unit dims may generate broadcast/shape_cast ops. Clean
+    // them up.
+    vector::BroadcastOp::getCanonicalizationPatterns(patterns, context);
+    vector::ShapeCastOp::getCanonicalizationPatterns(patterns, context);
+
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return failure();
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // SPIR-V TypeConverter
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
index 1a844a7cd018b..6418e931f7460 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/arith.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 //===----------------------------------------------------------------------===//
 // arithmetic ops
diff --git a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
index 02b938be775a3..311174bef15ed 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/combined.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @combined
 // CHECK: %[[C0_F32:.*]] = spirv.Constant 0.000000e+00 : f32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
index 347d282f9ee0c..c018ccb924983 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
@@ -66,6 +66,28 @@ func.func @simple_vector_8(%arg0 : vector<8xi32>) -> vector<8xi32> {
 
 // -----
 
+// CHECK-LABEL: @simple_vector_2d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
+func.func @simple_vector_2d(%arg0 : vector<4x4xi32>) -> vector<4x4xi32> {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<4x4xi32>
+  // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [1, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT1]] {offsets = [2, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [3, 0], strides = [1]} : vector<4xi32> into vector<4x4xi32>
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT0_1:.*]] = vector.extract %[[EXTRACT0]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT1_1:.*]] = vector.extract %[[EXTRACT1]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT2_1:.*]] = vector.extract %[[EXTRACT2]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract_strided_slice %[[INSERT3]] {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
+  // CHECK: %[[EXTRACT3_1:.*]] = vector.extract %[[EXTRACT3]][0] : vector<4xi32> from vector<1x4xi32>
+  // CHECK: return %[[EXTRACT0_1]], %[[EXTRACT1_1]], %[[EXTRACT2_1]], %[[EXTRACT3_1]] : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
+  return %arg0 : vector<4x4xi32>
+}
+
+// -----
+
 // CHECK-LABEL: @vector_6and8
 // CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
 func.func @vector_6and8(%arg0 : vector<6xi32>, %arg1 : vector<8xi32>) -> (vector<6xi32>, vector<8xi32>) {
@@ -113,6 +135,28 @@ func.func @scalar_vector(%arg0 : vector<8xi32>, %arg1 : vector<3xi32>, %arg2 : i
 
 // -----
 
+// CHECK-LABEL: @vector_2dand1d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>, %[[ARG3:.+]]: vector<3xi32>, %[[ARG4:.+]]: vector<4xi32>)
+func.func @vector_2dand1d(%arg0 : vector<2x6xi32>, %arg1 : vector<4xi32>) -> (vector<2x6xi32>, vector<4xi32>) {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<2x6xi32>
+  // CHECK: %[[INSERT0:.*]] = vector.insert_strided_slice %[[ARG0]], %[[CST]] {offsets = [0, 0], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert_strided_slice %[[ARG1]], %[[INSERT0]] {offsets = [0, 3], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert_strided_slice %[[ARG2]], %[[INSERT1]] {offsets = [1, 0], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert_strided_slice %[[ARG3]], %[[INSERT2]] {offsets = [1, 3], strides = [1]} : vector<3xi32> into vector<2x6xi32>
+  // CHECK: %[[EXTRACT0:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 0], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT0_1:.*]]  = vector.extract %[[EXTRACT0]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT1:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [0, 3], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT1_1:.*]]  = vector.extract %[[EXTRACT1]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT2:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 0], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT2_1:.*]]  = vector.extract %[[EXTRACT2]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: %[[EXTRACT3:.*]]  = vector.extract_strided_slice %[[INSERT3]] {offsets = [1, 3], sizes = [1, 3], strides = [1, 1]} : vector<2x6xi32> to vector<1x3xi32>
+  // CHECK: %[[EXTRACT3_1:.*]]  = vector.extract %[[EXTRACT3]][0] : vector<3xi32> from vector<1x3xi32>
+  // CHECK: return %[[EXTRACT0_1]], %[[EXTRACT1_1]], %[[EXTRACT2_1]], %[[EXTRACT3_1]], %[[ARG4]] : vector<3xi32>, vector<3xi32>, vector<3xi32>, vector<3xi32>, vector<4xi32>
+  return %arg0, %arg1 : vector<2x6xi32>, vector<4xi32>
+}
+
+// -----
+
 // CHECK-LABEL: @reduction
 // CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>, %[[ARG4:.+]]: i32)
 func.func @reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>, %arg2 : i32) -> (i32) {
diff --git a/mlir/test/Conversion/ConvertToSPIRV/index.mlir b/mlir/test/Conversion/ConvertToSPIRV/index.mlir
index e1cb18aac5d01..f4b116849fa93 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/index.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/index.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-to-spirv="run-signature-conversion=false" | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @basic
 func.func @basic(%a: index, %b: index) {
diff --git a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
index 58ec6ac61f6ac..246464928b81c 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/scf.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @if_yield
 // CHECK: %[[VAR:.*]] = spirv.Variable : !spirv.ptr<f32, Function>
diff --git a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
index c5e0e6603d94a..00556140c3018 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/simple.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @return_scalar
 // CHECK-SAME: %[[ARG0:.*]]: i32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
index a83bfb6f405a0..f34ca01c94f00 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/ub.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @ub
 // CHECK: %[[UNDEF:.*]] = spirv.Undef : i32
diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir
new file mode 100644
index 0000000000000..043f9422d8790
--- /dev/null
+++ b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir
@@ -0,0 +1,102 @@
+// RUN: mlir-opt -test-spirv-vector-unrolling -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: @vaddi
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>, %[[ARG3:.+]]: vector<3xi32>)
+func.func @vaddi(%arg0 : vector<6xi32>, %arg1 : vector<6xi32>) -> (vector<6xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<3xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<3xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]] : vector<3xi32>, vector<3xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<6xi32>
+  return %0 : vector<6xi32>
+}
+
+// CHECK-LABEL: @vaddi_2d
+// CHECK-SAME: (%[[ARG0:.+]]: vector<2xi32>, %[[ARG1:.+]]: vector<2xi32>, %[[ARG2:.+]]: vector<2xi32>, %[[ARG3:.+]]: vector<2xi32>)
+func.func @vaddi_2d(%arg0 : vector<2x2xi32>, %arg1 : vector<2x2xi32>) -> (vector<2x2xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<2xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<2xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]] : vector<2xi32>, vector<2xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<2x2xi32>
+  return %0 : vector<2x2xi32>
+}
+
+// CHECK-LABEL: @vaddi_2d_8
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>, %[[ARG4:.+]]: vector<4xi32>, %[[ARG5:.+]]: vector<4xi32>, %[[ARG6:.+]]: vector<4xi32>, %[[ARG7:.+]]: vector<4xi32>)
+func.func @vaddi_2d_8(%arg0 : vector<2x8xi32>, %arg1 : vector<2x8xi32>) -> (vector<2x8xi32>) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG4]] : vector<4xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG5]] : vector<4xi32>
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[ARG2]], %[[ARG6]] : vector<4xi32>
+  // CHECK: %[[ADD3:.*]] = arith.addi %[[ARG3]], %[[ARG7]] : vector<4xi32>
+  // CHECK: return %[[ADD0]], %[[ADD1]], %[[ADD2]], %[[ADD3]] : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
+  %0 = arith.addi %arg0, %arg1 : vector<2x8xi32>
+  return %0 : vector<2x8xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @reduction_5
+// CHECK-SAME: (%[[ARG0:.+]]: vector<1xi32>, %[[ARG1:.+]]: vector<1xi32>, %[[ARG2:.+]]: vector<1xi32>, %[[ARG3:.+]]: vector<1xi32>, %[[ARG4:.+]]: vector<1xi32>)
+func.func @reduction_5(%arg0 : vector<5xi32>) -> (i32) {
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : i32 from vector<1xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract %[[ARG1]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[EXTRACT0]], %[[EXTRACT1]] : i32
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract %[[ARG2]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[EXTRACT2]] : i32
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract %[[ARG3]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[ADD1]], %[[EXTRACT3]] : i32
+  // CHECK: %[[EXTRACT4:.*]] = vector.extract %[[ARG4]][0] : i32 from vector<1xi32>
+  // CHECK: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[EXTRACT4]] : i32
+  // CHECK: return %[[ADD3]] : i32
+  %0 = vector.reduction <add>, %arg0 : vector<5xi32> into i32
+  return %0 : i32
+}
+
+// CHECK-LABEL: @reduction_8
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>)
+func.func @reduction_8(%arg0 : vector<8xi32>) -> (i32) {
+  // CHECK: %[[REDUCTION0:.*]] = vector.reduction <add>, %[[ARG0]] : vector<4xi32> into i32
+  // CHECK: %[[REDUCTION1:.*]] = vector.reduction <add>, %[[ARG1]] : vector<4xi32> into i32
+  // CHECK: %[[ADD:.*]] = arith.addi %[[REDUCTION0]], %[[REDUCTION1]] : i32
+  // CHECK: return %[[ADD]] : i32
+  %0 = vector.reduction <add>, %arg0 : vector<8xi32> into i32
+  return %0 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @vaddi_reduction
+// CHECK-SAME: (%[[ARG0:.+]]: vector<4xi32>, %[[ARG1:.+]]: vector<4xi32>, %[[ARG2:.+]]: vector<4xi32>, %[[ARG3:.+]]: vector<4xi32>)
+func.func @vaddi_reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>) -> (i32) {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG2]] : vector<4xi32>
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ARG1]], %[[ARG3]] : vector<4xi32>
+  // CHECK: %[[REDUCTION0:.*]] = vector.reduction <add>, %[[ADD0]] : vector<4xi32> into i32
+  // CHECK: %[[REDUCTION1:.*]] = vector.reduction <add>, %[[ADD1]] : vector<4xi32> into i32
+  // CHECK: %[[ADD2:.*]] = arith.addi %[[REDUCTION0]], %[[REDUCTION1]] : i32
+  // CHECK: return %[[ADD2]] : i32
+  %0 = arith.addi %arg0, %arg1 : vector<8xi32>
+  %1 = vector.reduction <add>, %0 : vector<8xi32> into i32
+  return %1 : i32
+}
+
+// -----
+
+// CHECK-LABEL: @transpose
+// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>)
+func.func @transpose(%arg0 : vector<2x3xi32>) -> (vector<3x2xi32>) {
+  // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<2xi32>
+  // CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT0:.*]]= vector.insert %[[EXTRACT0]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT1:.*]] = vector.extract %[[ARG1]][0] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT1:.*]] = vector.insert %[[EXTRACT1]], %[[INSERT0]][1] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT2:.*]] = vector.extract %[[ARG0]][1] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT2:.*]] = vector.insert %[[EXTRACT2]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT3:.*]] = vector.extract %[[ARG1]][1] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT3:.*]] = vector.insert %[[EXTRACT3]], %[[INSERT2]] [1] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT4:.*]] = vector.extract %[[ARG0]][2] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT4:.*]] = vector.insert %[[EXTRACT4]], %[[CST]] [0] : i32 into vector<2xi32>
+  // CHECK: %[[EXTRACT5:.*]] = vector.extract %[[ARG1]][2] : i32 from vector<3xi32>
+  // CHECK: %[[INSERT5:.*]] = vector.insert %[[EXTRACT5]], %[[INSERT4]] [1] : i32 into vector<2xi32>
+  // CHECK: return %[[INSERT1]], %[[INSERT3]], %[[INSERT5]] : vector<2xi32>, vector<2xi32>, vector<2xi32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<2x3xi32> to vector<3x2xi32>
+  return %0 : vector<3x2xi32>
+}
diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
index c63dd030f4747..e369eadca5730 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file -convert-to-spirv="run-signature-conversion=false" %s | FileCheck %s
+// RUN: mlir-opt -convert-to-spirv="run-signature-conversion=false run-vector-unrolling=false" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @extract
 //  CHECK-SAME: %[[ARG:.+]]: vector<2xf32>
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
index 69b5787f7e851..aeade52c7ade5 100644
--- a/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestConvertToSPIRV
   TestSPIRVFuncSignatureConversion.cpp
+  TestSPIRVVectorUnrolling.cpp
 
   EXCLUDE_FROM_LIBMLIR
 
@@ -13,4 +14,5 @@ add_mlir_library(MLIRTestConvertToSPIRV
   MLIRTransformUtils
   MLIRTransforms
   MLIRVectorDialect
+  MLIRVectorTransforms
   )
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
index ec67f85f6f27b..4a792336caba4 100644
--- a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVFuncSignatureConversion.cpp
@@ -37,13 +37,8 @@ struct TestSPIRVFuncSignatureConversion final
   }
 
   void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    populateFuncOpVectorRewritePatterns(patterns);
-    populateReturnOpVectorRewritePatterns(patterns);
-    GreedyRewriteConfig config;
-    config.strictMode = GreedyRewriteStrictness::ExistingOps;
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
-                                       config);
+    Operation *op = getOperation();
+    (void)spirv::unrollVectorsInSignatures(op);
   }
 };
 
diff --git a/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp
new file mode 100644
index 0000000000000..0bad43d5214b1
--- /dev/null
+++ b/mlir/test/lib/Conversion/ConvertToSPIRV/TestSPIRVVectorUnrolling.cpp
@@ -0,0 +1,50 @@
+//===- TestSPIRVVectorUnrolling.cpp - Test vector unrolling ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace {
+
+struct TestSPIRVVectorUnrolling final
+    : PassWrapper<TestSPIRVVectorUnrolling, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSPIRVVectorUnrolling)
+
+  StringRef getArgument() const final { return "test-spirv-vector-unrolling"; }
+
+  StringRef getDescription() const final {
+    return "Test patterns that unroll vectors to types supported by SPIR-V";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<spirv::SPIRVDialect, vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    (void)spirv::unrollVectorsInSignatures(op);
+    (void)spirv::unrollVectorsInFuncBodies(op);
+  }
+};
+
+} // namespace
+
+namespace test {
+void registerTestSPIRVVectorUnrolling() {
+  PassRegistration<TestSPIRVVectorUnrolling>();
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 149f9d59961b8..0f29963da39bb 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -142,6 +142,7 @@ void registerTestSCFWrapInZeroTripCheckPasses();
 void registerTestShapeMappingPass();
 void registerTestSliceAnalysisPass();
 void registerTestSPIRVFuncSignatureConversion();
+void registerTestSPIRVVectorUnrolling();
 void registerTestTensorCopyInsertionPass();
 void registerTestTensorTransforms();
 void registerTestTopologicalSortAnalysisPass();
@@ -275,6 +276,7 @@ void registerTestPasses() {
   mlir::test::registerTestShapeMappingPass();
   mlir::test::registerTestSliceAnalysisPass();
   mlir::test::registerTestSPIRVFuncSignatureConversion();
+  mlir::test::registerTestSPIRVVectorUnrolling();
   mlir::test::registerTestTensorCopyInsertionPass();
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTopologicalSortAnalysisPass();

>From e7a3aeb1f86b8d45815b834e4e20b1fc7e71705d Mon Sep 17 00:00:00 2001
From: Mitch Phillips <mitchp at google.com>
Date: Wed, 24 Jul 2024 16:44:25 +0200
Subject: [PATCH 018/259] Revert "[compiler-rt] Move `endif` to correct place
 (#100342)"

This reverts commit 558a8953680fd03bdd49a6708f3ea82d82328769.

This was a fix-forward for
https://github.com/llvm/llvm-project/pull/99613 that unfortunately
didn't work for the Android sanitizer buildbot. More information in that
pull request.
---
 compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 8b1affb0a048d..bc2cb247f2a8a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -2237,7 +2237,6 @@ static const char *RegNumToRegName(int reg) {
     case 31:
       return "sp";
 #    endif
-#  endif  // SANITIZER_LINUX
     default:
       return NULL;
   }
@@ -2303,6 +2302,7 @@ static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
   (void)RegName;
 #    endif
 }
+#  endif
 
 void SignalContext::DumpAllRegisters(void *context) {
   ucontext_t *ucontext = (ucontext_t *)context;

>From 59e1c6cd63eb9287df6516f9a5ae564075e9c218 Mon Sep 17 00:00:00 2001
From: Mitch Phillips <mitchp at google.com>
Date: Wed, 24 Jul 2024 16:45:01 +0200
Subject: [PATCH 019/259] Revert "[compiler-rt] Implement `DumpAllRegisters`
 for arm-linux and aarch64-linux (#99613)"

This reverts commit ef1c70d26b7e84a6f47c0c6a868b769935b2b008.

This unfortunately broke the sanitizer buildbot(s), and the fix-forward
didn't work. More details are in
https://github.com/llvm/llvm-project/pull/99613
---
 .../lib/sanitizer_common/sanitizer_linux.cpp  | 142 +-----------------
 .../Linux/dump_registers_aarch64.cpp          |  23 ---
 .../TestCases/Linux/dump_registers_arm.cpp    |  19 ---
 3 files changed, 2 insertions(+), 182 deletions(-)
 delete mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
 delete mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index bc2cb247f2a8a..483a1042a6238 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -2172,118 +2172,15 @@ static const char *RegNumToRegName(int reg) {
       return "ebp";
     case REG_ESP:
       return "esp";
-#    elif defined(__arm__)
-#      define REG_STR(reg) #reg
-#      define MAKE_CASE(N) \
-        case REG_R##N:     \
-          return REG_STR(r##N)
-    MAKE_CASE(0);
-    MAKE_CASE(1);
-    MAKE_CASE(2);
-    MAKE_CASE(3);
-    MAKE_CASE(4);
-    MAKE_CASE(5);
-    MAKE_CASE(6);
-    MAKE_CASE(7);
-    MAKE_CASE(8);
-    MAKE_CASE(9);
-    MAKE_CASE(10);
-    MAKE_CASE(11);
-    MAKE_CASE(12);
-    case REG_R13:
-      return "sp";
-    case REG_R14:
-      return "lr";
-    case REG_R15:
-      return "pc";
-#    elif defined(__aarch64__)
-#      define REG_STR(reg) #reg
-#      define MAKE_CASE(N) \
-        case N:            \
-          return REG_STR(x##N)
-    MAKE_CASE(0);
-    MAKE_CASE(1);
-    MAKE_CASE(2);
-    MAKE_CASE(3);
-    MAKE_CASE(4);
-    MAKE_CASE(5);
-    MAKE_CASE(6);
-    MAKE_CASE(7);
-    MAKE_CASE(8);
-    MAKE_CASE(9);
-    MAKE_CASE(10);
-    MAKE_CASE(11);
-    MAKE_CASE(12);
-    MAKE_CASE(13);
-    MAKE_CASE(14);
-    MAKE_CASE(15);
-    MAKE_CASE(16);
-    MAKE_CASE(17);
-    MAKE_CASE(18);
-    MAKE_CASE(19);
-    MAKE_CASE(20);
-    MAKE_CASE(21);
-    MAKE_CASE(22);
-    MAKE_CASE(23);
-    MAKE_CASE(24);
-    MAKE_CASE(25);
-    MAKE_CASE(26);
-    MAKE_CASE(27);
-    MAKE_CASE(28);
-    case 29:
-      return "fp";
-    case 30:
-      return "lr";
-    case 31:
-      return "sp";
 #    endif
+#  endif
     default:
       return NULL;
   }
   return NULL;
 }
 
-#    if SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
-static uptr GetArmRegister(ucontext_t *ctx, int RegNum) {
-  switch (RegNum) {
-#      if defined(__arm__)
-#        define MAKE_CASE(N) \
-          case REG_R##N:     \
-            return ctx->uc_mcontext.arm_r##N
-    MAKE_CASE(0);
-    MAKE_CASE(1);
-    MAKE_CASE(2);
-    MAKE_CASE(3);
-    MAKE_CASE(4);
-    MAKE_CASE(5);
-    MAKE_CASE(6);
-    MAKE_CASE(7);
-    MAKE_CASE(8);
-    MAKE_CASE(9);
-    MAKE_CASE(10);
-    case REG_R11:
-      return ctx->uc_mcontext.arm_fp;
-    case REG_R12:
-      return ctx->uc_mcontext.arm_ip;
-    case REG_R13:
-      return ctx->uc_mcontext.arm_sp;
-    case REG_R14:
-      return ctx->uc_mcontext.arm_lr;
-    case REG_R15:
-      return ctx->uc_mcontext.arm_pc;
-#      elif defined(__aarch64__)
-    case 0 ... 30:
-      return ctx->uc_mcontext.regs[RegNum];
-    case 31:
-      return ctx->uc_mcontext.sp;
-#      endif
-    default:
-      return 0;
-  }
-  return 0;
-}
-#    endif  // SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
-
+#  if SANITIZER_LINUX
 UNUSED
 static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
   const char *RegName = RegNumToRegName(RegNum);
@@ -2292,12 +2189,6 @@ static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
          RegName, ctx->uc_mcontext.gregs[RegNum]);
 #    elif defined(__i386__)
   Printf("%s = 0x%08x  ", RegName, ctx->uc_mcontext.gregs[RegNum]);
-#    elif defined(__arm__)
-  Printf("%s%s = 0x%08zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
-         GetArmRegister(ctx, RegNum));
-#    elif defined(__aarch64__)
-  Printf("%s%s = 0x%016zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
-         GetArmRegister(ctx, RegNum));
 #    else
   (void)RegName;
 #    endif
@@ -2345,35 +2236,6 @@ void SignalContext::DumpAllRegisters(void *context) {
   DumpSingleReg(ucontext, REG_EBP);
   DumpSingleReg(ucontext, REG_ESP);
   Printf("\n");
-#    elif defined(__arm__)
-  Report("Register values:\n");
-  DumpSingleReg(ucontext, REG_R0);
-  DumpSingleReg(ucontext, REG_R1);
-  DumpSingleReg(ucontext, REG_R2);
-  DumpSingleReg(ucontext, REG_R3);
-  Printf("\n");
-  DumpSingleReg(ucontext, REG_R4);
-  DumpSingleReg(ucontext, REG_R5);
-  DumpSingleReg(ucontext, REG_R6);
-  DumpSingleReg(ucontext, REG_R7);
-  Printf("\n");
-  DumpSingleReg(ucontext, REG_R8);
-  DumpSingleReg(ucontext, REG_R9);
-  DumpSingleReg(ucontext, REG_R10);
-  DumpSingleReg(ucontext, REG_R11);
-  Printf("\n");
-  DumpSingleReg(ucontext, REG_R12);
-  DumpSingleReg(ucontext, REG_R13);
-  DumpSingleReg(ucontext, REG_R14);
-  DumpSingleReg(ucontext, REG_R15);
-  Printf("\n");
-#    elif defined(__aarch64__)
-  Report("Register values:\n");
-  for (int i = 0; i <= 31; ++i) {
-    DumpSingleReg(ucontext, i);
-    if (i % 4 == 3)
-      Printf("\n");
-  }
 #    else
   (void)ucontext;
 #    endif
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
deleted file mode 100644
index e01b826c86b8a..0000000000000
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Check that sanitizer prints registers dump_registers on dump_registers=1
-// RUN: %clangxx  %s -o %t
-// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
-// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
-//
-// REQUIRES: aarch64-target-arch
-
-#include <signal.h>
-
-int main() {
-  raise(SIGSEGV);
-  // CHECK-DUMP: Register values
-  // CHECK-DUMP-NEXT: x0 = {{0x[0-9a-f]+}}   x1 = {{0x[0-9a-f]+}}   x2 = {{0x[0-9a-f]+}}   x3 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT: x4 = {{0x[0-9a-f]+}}   x5 = {{0x[0-9a-f]+}}   x6 = {{0x[0-9a-f]+}}   x7 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT: x8 = {{0x[0-9a-f]+}}   x9 = {{0x[0-9a-f]+}}  x10 = {{0x[0-9a-f]+}}  x11 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:x12 = {{0x[0-9a-f]+}}  x13 = {{0x[0-9a-f]+}}  x14 = {{0x[0-9a-f]+}}  x15 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:x16 = {{0x[0-9a-f]+}}  x17 = {{0x[0-9a-f]+}}  x18 = {{0x[0-9a-f]+}}  x19 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:x20 = {{0x[0-9a-f]+}}  x21 = {{0x[0-9a-f]+}}  x22 = {{0x[0-9a-f]+}}  x23 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:x24 = {{0x[0-9a-f]+}}  x25 = {{0x[0-9a-f]+}}  x26 = {{0x[0-9a-f]+}}  x27 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:x28 = {{0x[0-9a-f]+}}   fp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}
-  // CHECK-NODUMP-NOT: Register values
-  return 0;
-}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
deleted file mode 100644
index e17dbf196227b..0000000000000
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-// Check that sanitizer prints registers dump_registers on dump_registers=1
-// RUN: %clangxx  %s -o %t
-// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
-// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
-//
-// REQUIRES: arm-target-arch
-
-#include <signal.h>
-
-int main() {
-  raise(SIGSEGV);
-  // CHECK-DUMP: Register values
-  // CHECK-DUMP-NEXT: r0 = {{0x[0-9a-f]+}}   r1 = {{0x[0-9a-f]+}}   r2 = {{0x[0-9a-f]+}}   r3 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT: r4 = {{0x[0-9a-f]+}}   r5 = {{0x[0-9a-f]+}}   r6 = {{0x[0-9a-f]+}}   r7 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT: r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
-  // CHECK-DUMP-NEXT:r12 = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   pc = {{0x[0-9a-f]+}}
-  // CHECK-NODUMP-NOT: Register values
-  return 0;
-}

>From 9f08ae8d2dd1ff9ec3b033d099282dee05528413 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 24 Jul 2024 16:15:58 +0200
Subject: [PATCH 020/259] [clang][Interp][NFC] Fix getting the record decl from
 a pointer type

---
 clang/lib/AST/Interp/Compiler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 0455eec3f0145..58a0852aca755 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -5281,8 +5281,8 @@ template <class Emitter>
 unsigned Compiler<Emitter>::collectBaseOffset(const QualType BaseType,
                                               const QualType DerivedType) {
   const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
-    if (const auto *PT = dyn_cast<PointerType>(Ty))
-      return PT->getPointeeType()->getAsCXXRecordDecl();
+    if (const auto *R = Ty->getPointeeCXXRecordDecl())
+      return R;
     return Ty->getAsCXXRecordDecl();
   };
   const CXXRecordDecl *BaseDecl = extractRecordDecl(BaseType);

>From fbd303ba5dd98d4513d62058bc1ad463507464ce Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 24 Jul 2024 16:02:22 +0100
Subject: [PATCH 021/259] [DAG] combineAndnp - fold ANDNP(x,PSHUFB(y,z)) ->
 PSHUFB(y,OR(z,x))

Based on #96754 - if we are zeroing out whole elements, then fold the zeroing into the PSHUFB mask instead (if the MSB of a mask element is set, that element is zero).

This allows us to replace the non-commutative ANDNP with a much simpler OR node.

Move this after ANDNP shuffle combining, as that can do a better job of this for constant ANDNP + PSHUFB masks.
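
For illustration only (not part of the patch), here is a minimal scalar model of the identity being exploited. It assumes the precondition the DAG code checks via ComputeNumSignBits: every byte of x is either 0x00 or 0xFF. ORing an all-ones byte of x into the PSHUFB mask sets that mask byte's MSB and zeroes the lane, so ANDNP(x, PSHUFB(y, z)) and PSHUFB(y, OR(z, x)) produce the same bytes:

// Minimal scalar model of ANDNP(x, PSHUFB(y, z)) -> PSHUFB(y, OR(z, x)).
// Illustrative sketch only, not LLVM code. It assumes every byte of x is
// 0x00 or 0xFF, mirroring the ComputeNumSignBits(N0) == EltSizeInBits check.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <random>

using V16 = std::array<uint8_t, 16>;

// PSHUFB per-byte semantics: if the mask MSB is set the result byte is zero,
// otherwise it selects byte (mask & 15) from the source register.
static V16 pshufb(const V16 &Src, const V16 &Mask) {
  V16 R{};
  for (int I = 0; I < 16; ++I)
    R[I] = (Mask[I] & 0x80) ? 0 : Src[Mask[I] & 15];
  return R;
}

static V16 andnp(const V16 &A, const V16 &B) { // (~A) & B, per byte
  V16 R{};
  for (int I = 0; I < 16; ++I)
    R[I] = static_cast<uint8_t>(~A[I] & B[I]);
  return R;
}

static V16 vor(const V16 &A, const V16 &B) {
  V16 R{};
  for (int I = 0; I < 16; ++I)
    R[I] = static_cast<uint8_t>(A[I] | B[I]);
  return R;
}

int main() {
  std::mt19937 Rng(0);
  for (int Trial = 0; Trial < 1000; ++Trial) {
    V16 X, Y, Z;
    for (int I = 0; I < 16; ++I) {
      X[I] = (Rng() & 1) ? 0xFF : 0x00; // whole elements are kept or zeroed
      Y[I] = static_cast<uint8_t>(Rng());
      Z[I] = static_cast<uint8_t>(Rng());
    }
    assert(andnp(X, pshufb(Y, Z)) == pshufb(Y, vor(Z, X)));
  }
  std::puts("fold identity holds on all sampled inputs");
}

Compiling and running this checks the byte-level equivalence over random inputs satisfying the precondition.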
---
 llvm/lib/Target/X86/X86ISelLowering.cpp  | 29 ++++++++++----
 llvm/test/CodeGen/X86/var-permute-128.ll | 49 +++++++++---------------
 2 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b532fdd98ce48..14a6a91262abc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53482,13 +53482,6 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   if (SDValue Not = IsNOT(N0, DAG))
     return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
 
-  // Fold for better commutativity:
-  // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
-  if (N1->hasOneUse())
-    if (SDValue Not = IsNOT(N1, DAG))
-      return DAG.getNOT(
-          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
-
   // Constant Folding
   APInt Undefs0, Undefs1;
   SmallVector<APInt> EltBits0, EltBits1;
@@ -53566,6 +53559,28 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Folds for better commutativity:
+  if (N1->hasOneUse()) {
+    // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
+    if (SDValue Not = IsNOT(N1, DAG))
+      return DAG.getNOT(
+          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
+
+    // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
+    // Zero out elements by setting the PSHUFB mask value to 0xFF.
+    if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
+      SDValue BC1 = peekThroughOneUseBitcasts(N1);
+      if (BC1.getOpcode() == X86ISD::PSHUFB) {
+        EVT ShufVT = BC1.getValueType();
+        SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
+                                      DAG.getBitcast(ShufVT, N0));
+        SDValue NewShuf =
+            DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
+        return DAG.getBitcast(VT, NewShuf);
+      }
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 52ede755df5d6..7f4111e65cc17 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -309,9 +309,8 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_zero_v4i32:
@@ -322,9 +321,8 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; XOP-LABEL: var_shuffle_zero_v4i32:
@@ -545,16 +543,15 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ;
 ; SSSE3-LABEL: var_shuffle_zero_v8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
-; SSSE3-NEXT:    psubusw %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    pcmpeqw %xmm3, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
+; SSSE3-NEXT:    psubusw %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
 ; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_zero_v8i16:
@@ -565,9 +562,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
 ; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; XOP-LABEL: var_shuffle_zero_v8i16:
@@ -576,8 +572,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
 ; XOP-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shuffle_zero_v8i16:
@@ -587,8 +583,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
 ; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_zero_v8i16:
@@ -598,8 +594,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
 ; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_zero_v8i16:
@@ -889,10 +885,8 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; SSSE3-NEXT:    pmaxub %xmm1, %xmm2
 ; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    por %xmm1, %xmm2
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_zero_v16i8:
@@ -900,10 +894,8 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; SSE41-NEXT:    pmaxub %xmm1, %xmm2
 ; SSE41-NEXT:    pcmpeqb %xmm1, %xmm2
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    pshufb %xmm1, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; XOP-LABEL: var_shuffle_zero_v16i8:
@@ -911,7 +903,6 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; XOP-NEXT:    vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
 ; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shuffle_zero_v16i8:
@@ -920,7 +911,6 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_zero_v16i8:
@@ -929,7 +919,6 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_zero_v16i8:
@@ -1289,9 +1278,8 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_zero_v4f32:
@@ -1302,9 +1290,8 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; XOP-LABEL: var_shuffle_zero_v4f32:

>From 1195df25f4df987d258e6b46c090ca3e72eccecb Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 10:06:17 -0500
Subject: [PATCH 022/259] [libc] Make the libc files always available to clang
 for the GPU (#100396)

Summary:
The GPU build always wants these to be exported to `clang`. These will
be necessary for setting up tests or using the `libc` from the clang
binary. This will cover the standard 'runtimes' build. If the user is
doing a direct compilation, we will just leave it as-is and assume they
know what they're doing.
---
 libc/CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 6e0760724d963..1ce3bdf53b525 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -228,7 +228,7 @@ foreach(config_path IN LISTS LIBC_CONFIG_JSON_FILE_LIST)
   load_libc_config(${config_path}/config.json ${cmd_line_conf})
 endforeach()
 
-if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
+if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND (LIBC_ENABLE_USE_BY_CLANG OR LIBC_TARGET_OS_IS_GPU))
   set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
@@ -244,9 +244,7 @@ else()
     set(LIBC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX})
   endif()
   if(LIBC_TARGET_OS_IS_GPU)
-    if(LLVM_RUNTIMES_TARGET)
-      set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_RUNTIMES_TARGET})
-    elseif(LIBC_TARGET_TRIPLE)
+    if(LIBC_TARGET_TRIPLE)
       set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LIBC_TARGET_TRIPLE})
     else()
       set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})

>From 338c35aabfbede9ba10a4f48a13e63f37d6f8c7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 24 Jul 2024 16:22:15 +0200
Subject: [PATCH 023/259] [clang][Interp] Fix calling variadic call operators

---
 clang/lib/AST/Interp/Interp.cpp |  3 ++-
 clang/test/AST/Interp/cxx20.cpp | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index bf29b85041d82..3694253ae9782 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -233,7 +233,8 @@ void cleanupAfterFunctionCall(InterpState &S, CodePtr OpPC) {
       assert(false && "Can't get arguments from that expression type");
 
     assert(NumArgs >= CurFunc->getNumWrittenParams());
-    NumVarArgs = NumArgs - CurFunc->getNumWrittenParams();
+    NumVarArgs = NumArgs - (CurFunc->getNumWrittenParams() +
+                            isa<CXXOperatorCallExpr>(CallSite));
     for (unsigned I = 0; I != NumVarArgs; ++I) {
       const Expr *A = Args[NumArgs - 1 - I];
       popArg(S, A);
diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp
index 2faacbbf70fd7..da80454b7a820 100644
--- a/clang/test/AST/Interp/cxx20.cpp
+++ b/clang/test/AST/Interp/cxx20.cpp
@@ -827,3 +827,17 @@ namespace CheckingNullPtrForInitialization {
     return x;
   }
 }
+
+namespace VariadicCallOperator {
+  class F {
+  public:
+    constexpr void operator()(int a, int b, ...) {}
+  };
+  constexpr int foo() {
+    F f;
+
+    f(1,2, 3);
+    return 1;
+  }
+  constexpr int A = foo();
+}

>From 8d8fa01a66d5051356b698ef0264a01f6aade185 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 10:08:20 -0500
Subject: [PATCH 024/259] Reapply "[libc] Remove 'packaged' GPU build support
 (#100208)"

This reverts commit 550b83d658755664a7f0f93b36242e885743a91b.
---
 libc/cmake/modules/LLVMLibCLibraryRules.cmake | 91 -------------------
 libc/docs/gpu/building.rst                    | 19 ----
 libc/docs/gpu/using.rst                       | 15 +--
 libc/lib/CMakeLists.txt                       | 15 ---
 offload/test/libc/assert.c                    |  3 +-
 offload/test/lit.cfg                          |  7 --
 6 files changed, 10 insertions(+), 140 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
index 75bc81e2aee8e..e677b4cd2c28f 100644
--- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake
+++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -83,97 +83,6 @@ function(get_all_object_file_deps result fq_deps_list)
   set(${result} ${all_deps} PARENT_SCOPE)
 endfunction()
 
-# A rule to build a library from a collection of entrypoint objects and bundle
-# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'.
-# Usage:
-#     add_gpu_entrypoint_library(
-#       DEPENDS <list of add_entrypoint_object targets>
-#     )
-function(add_gpu_entrypoint_library target_name base_target_name)
-  cmake_parse_arguments(
-    "ENTRYPOINT_LIBRARY"
-    "" # No optional arguments
-    "" # No single value arguments
-    "DEPENDS" # Multi-value arguments
-    ${ARGN}
-  )
-  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
-    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
-                        "of 'add_entrypoint_object' targets.")
-  endif()
-
-  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
-  get_all_object_file_deps(all_deps "${fq_deps_list}")
-
-  # The GPU 'libc' needs to be exported in a format that can be linked with
-  # offloading langauges like OpenMP or CUDA. This wraps every GPU object into a
-  # fat binary and adds them to a static library.
-  set(objects "")
-  foreach(dep IN LISTS all_deps)
-    set(object $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
-    string(FIND ${dep} "." last_dot_loc REVERSE)
-    math(EXPR name_loc "${last_dot_loc} + 1")
-    string(SUBSTRING ${dep} ${name_loc} -1 name)
-    if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-      set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
-    elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-      set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
-    endif()
-
-    # Use the 'clang-offload-packager' to merge these files into a binary blob.
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
-      COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
-              "${prefix},file=$<JOIN:${object},,file=>" -o
-              ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
-      DEPENDS ${dep} ${base_target_name}
-      COMMENT "Packaging LLVM offloading binary for '${object}'"
-    )
-    add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
-                      "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
-    if(TARGET clang-offload-packager)
-      add_dependencies(${dep}.__gpubin__ clang-offload-packager)
-    endif()
-
-    # CMake does not permit setting the name on object files. In order to have
-    # human readable names we create an empty stub file with the entrypoint
-    # name. This empty file will then have the created binary blob embedded.
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
-      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
-      DEPENDS ${dep} ${dep}.__gpubin__ ${base_target_name}
-    )
-    add_custom_target(${dep}.__stub__
-                      DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
-
-    add_library(${dep}.__fatbin__
-      EXCLUDE_FROM_ALL OBJECT
-      "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
-    )
-
-    # This is always compiled for the LLVM host triple instead of the native GPU
-    # triple that is used by default in the build.
-    target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
-    target_compile_options(${dep}.__fatbin__ PRIVATE
-      --target=${LLVM_HOST_TRIPLE}
-      "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
-    add_dependencies(${dep}.__fatbin__
-                     ${dep} ${dep}.__stub__ ${dep}.__gpubin__ ${base_target_name})
-
-    # Set the list of newly create fat binaries containing embedded device code.
-    list(APPEND objects $<TARGET_OBJECTS:${dep}.__fatbin__>)
-  endforeach()
-
-  add_library(
-    ${target_name}
-    STATIC
-      ${objects}
-  )
-  set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
-endfunction(add_gpu_entrypoint_library)
-
 # A rule to build a library from a collection of entrypoint objects and bundle
 # it in a single LLVM-IR bitcode file.
 # Usage:
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index d3e64c6d42431..60498e348395a 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -151,25 +151,6 @@ Build overview
 Once installed, the GPU build will create several files used for different
 targets. This section will briefly describe their purpose.
 
-**lib/<host-triple>/libcgpu-amdgpu.a or lib/libcgpu-amdgpu.a**
-  A static library containing fat binaries supporting AMD GPUs. These are built
-  using the support described in the `clang documentation
-  <https://clang.llvm.org/docs/OffloadingDesign.html>`_. These are intended to
-  be static libraries included natively for offloading languages like CUDA, HIP,
-  or OpenMP. This implements the standard C library.
-
-**lib/<host-triple>/libmgpu-amdgpu.a or lib/libmgpu-amdgpu.a**
-  A static library containing fat binaries that implements the standard math
-  library for AMD GPUs.
-
-**lib/<host-triple>/libcgpu-nvptx.a or lib/libcgpu-nvptx.a**
-  A static library containing fat binaries that implement the standard C library
-  for NVIDIA GPUs.
-
-**lib/<host-triple>/libmgpu-nvptx.a or lib/libmgpu-nvptx.a**
-  A static library containing fat binaries that implement the standard math
-  library for NVIDIA GPUs.
-
 **include/<target-triple>**
   The include directory where all of the generated headers for the target will
   go. These definitions are strictly for the GPU when being targeted directly.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index d5ad4c7a0368d..4034c04867c99 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -34,16 +34,17 @@ described in the `clang documentation
 by the OpenMP toolchain, but is currently opt-in for the CUDA and HIP toolchains
 through the ``--offload-new-driver``` and ``-fgpu-rdc`` flags.
 
-The installation should contain a static library called ``libcgpu-amdgpu.a`` or
-``libcgpu-nvptx.a`` depending on which GPU architectures your build targeted.
-These contain fat binaries compatible with the offloading toolchain such that
-they can be used directly.
+In order to link the GPU runtime, we simply pass this library to the embedded
+device linker job. This can be done using the ``-Xoffload-linker`` option, which 
+forwards an argument to a ``clang`` job used to create the final GPU executable. 
+The toolchain should pick up the C libraries automatically in most cases, so 
+this shouldn't be necessary.
 
 .. code-block:: sh
 
-  $> clang openmp.c -fopenmp --offload-arch=gfx90a -lcgpu-amdgpu
-  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -lcgpu-nvptx
-  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -lcgpu-amdgpu
+  $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc
+  $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
+  $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc
 
 This will automatically link in the needed function definitions if they were
 required by the user's application. Normally using the ``-fgpu-rdc`` option
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 37acf3950b460..4b7cfc4b76e2e 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -40,20 +40,6 @@ foreach(archive IN ZIP_LISTS
   # Add the offloading version of the library for offloading languages. These
   # are installed in the standard search path separate from the other libraries.
   if(LIBC_TARGET_OS_IS_GPU)
-    add_gpu_entrypoint_library(
-      ${archive_1}gpu
-      ${archive_1}
-      DEPENDS
-        ${${archive_2}}
-    )
-    set_target_properties(
-      ${archive_1}gpu
-      PROPERTIES
-        ARCHIVE_OUTPUT_NAME ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}
-        ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}
-    )
-    list(APPEND added_gpu_archive_targets ${archive_1}gpu)
-
     add_bitcode_entrypoint_library(
       ${archive_1}bitcode
       ${archive_1}
@@ -65,7 +51,6 @@ foreach(archive IN ZIP_LISTS
       PROPERTIES
         OUTPUT_NAME ${archive_1}.bc
     )
-    add_dependencies(${archive_1}gpu ${archive_1}bitcode)
     list(APPEND added_gpu_bitcode_targets ${archive_1}bitcode)
   endif()
 endforeach()
diff --git a/offload/test/libc/assert.c b/offload/test/libc/assert.c
index c2c5055c47051..0501e36329024 100644
--- a/offload/test/libc/assert.c
+++ b/offload/test/libc/assert.c
@@ -3,8 +3,9 @@
 
 // REQUIRES: libc
 
-// NVPTX without LTO uses the implementation in OpenMP currently.
+// AMDGPU and NVPTX without LTO use the implementation in OpenMP currently.
 // UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: amdgcn-amd-amdhsa
 // REQUIRES: gpu
 
 #include <assert.h>
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 99d7defdb9e11..0479295c6cacc 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -179,13 +179,6 @@ def remove_suffix_if_present(name):
         return name
 
 def add_libraries(source):
-    if config.libomptarget_has_libc:
-        if config.libomptarget_current_target.startswith('nvptx'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
-        elif config.libomptarget_current_target.startswith('amdgcn'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
     return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
 
 # Add platform targets

>From 7d787df5b932b73aae6532d1e981152f103f9244 Mon Sep 17 00:00:00 2001
From: cor3ntin <corentinjabot at gmail.com>
Date: Wed, 24 Jul 2024 17:27:58 +0200
Subject: [PATCH 025/259] [Clang][NFC] Simplify initialization of
 `OverloadCandidate` objects. (#100318)

Initialize some fields of OverloadCandidate in its constructor. The goal
here is to try to fix a read of an uninitialized variable (which I was not
able to reproduce):
https://github.com/llvm/llvm-project/pull/93430#issuecomment-2187544278

We should certainly try to improve the construction of
`OverloadCandidate` further, as it can be quite brittle.
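
For what it's worth, the general pattern looks like the sketch below (hypothetical struct and field names, not the actual Sema types): flag members are set once in the constructor, so a call site that forgets to assign one can no longer leave it uninitialized.

// Illustrative sketch of the pattern; not clang's OverloadCandidate.
#include <cstdio>

struct Candidate {
  bool Viable = false;
  // Previously these were assigned at each call site that built a candidate;
  // a missed assignment meant a later read of an uninitialized member.
  bool IsSurrogate : 1;
  bool IgnoreObjectArgument : 1;
  bool TookAddressOfOverload : 1;

  Candidate()
      : IsSurrogate(false), IgnoreObjectArgument(false),
        TookAddressOfOverload(false) {}
};

int main() {
  Candidate C;     // every flag starts in a known state
  C.Viable = true; // call sites only touch what they actually change
  std::printf("%d %d %d\n", C.IsSurrogate, C.IgnoreObjectArgument,
              C.TookAddressOfOverload);
}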
---
 clang/include/clang/Sema/Overload.h |  4 +++-
 clang/lib/Sema/SemaOverload.cpp     | 20 +-------------------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 9d8b797af6663..26ffe057c74a2 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -998,7 +998,9 @@ class Sema;
   private:
     friend class OverloadCandidateSet;
     OverloadCandidate()
-        : IsSurrogate(false), IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {}
+        : IsSurrogate(false), IgnoreObjectArgument(false),
+          TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL),
+          RewriteKind(CRK_None) {}
   };
 
   /// OverloadCandidateSet - A set of overload candidates, used in C++
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index a8d250fbabfed..554a2df14bea6 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6857,10 +6857,7 @@ void Sema::AddOverloadCandidate(
   Candidate.Viable = true;
   Candidate.RewriteKind =
       CandidateSet.getRewriteInfo().getRewriteKind(Function, PO);
-  Candidate.IsSurrogate = false;
   Candidate.IsADLCandidate = IsADLCandidate;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   Candidate.ExplicitCallArguments = Args.size();
 
   // Explicit functions are not actually candidates at all if we're not
@@ -7422,8 +7419,6 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
   Candidate.Function = Method;
   Candidate.RewriteKind =
       CandidateSet.getRewriteInfo().getRewriteKind(Method, PO);
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
   Candidate.TookAddressOfOverload =
       CandidateSet.getKind() == OverloadCandidateSet::CSK_AddressOfOverloadSet;
   Candidate.ExplicitCallArguments = Args.size();
@@ -7617,7 +7612,6 @@ void Sema::AddMethodTemplateCandidate(
     Candidate.IgnoreObjectArgument =
         cast<CXXMethodDecl>(Candidate.Function)->isStatic() ||
         ObjectType.isNull();
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = Args.size();
     if (Result == TemplateDeductionResult::NonDependentConversionFailure)
       Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -7705,7 +7699,6 @@ void Sema::AddTemplateOverloadCandidate(
     Candidate.IgnoreObjectArgument =
         isa<CXXMethodDecl>(Candidate.Function) &&
         !isa<CXXConstructorDecl>(Candidate.Function);
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = Args.size();
     if (Result == TemplateDeductionResult::NonDependentConversionFailure)
       Candidate.FailureKind = ovl_fail_bad_conversion;
@@ -7886,9 +7879,6 @@ void Sema::AddConversionCandidate(
   OverloadCandidate &Candidate = CandidateSet.addCandidate(1);
   Candidate.FoundDecl = FoundDecl;
   Candidate.Function = Conversion;
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   Candidate.FinalConversion.setAsIdentityConversion();
   Candidate.FinalConversion.setFromType(ConvType);
   Candidate.FinalConversion.setAllToTypes(ToType);
@@ -8084,9 +8074,6 @@ void Sema::AddTemplateConversionCandidate(
     Candidate.Function = FunctionTemplate->getTemplatedDecl();
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_bad_deduction;
-    Candidate.IsSurrogate = false;
-    Candidate.IgnoreObjectArgument = false;
-    Candidate.TookAddressOfOverload = false;
     Candidate.ExplicitCallArguments = 1;
     Candidate.DeductionFailure = MakeDeductionFailureInfo(Context, Result,
                                                           Info);
@@ -8119,10 +8106,8 @@ void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion,
   Candidate.FoundDecl = FoundDecl;
   Candidate.Function = nullptr;
   Candidate.Surrogate = Conversion;
-  Candidate.Viable = true;
   Candidate.IsSurrogate = true;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
+  Candidate.Viable = true;
   Candidate.ExplicitCallArguments = Args.size();
 
   // Determine the implicit conversion sequence for the implicit
@@ -8328,9 +8313,6 @@ void Sema::AddBuiltinCandidate(QualType *ParamTys, ArrayRef<Expr *> Args,
   OverloadCandidate &Candidate = CandidateSet.addCandidate(Args.size());
   Candidate.FoundDecl = DeclAccessPair::make(nullptr, AS_none);
   Candidate.Function = nullptr;
-  Candidate.IsSurrogate = false;
-  Candidate.IgnoreObjectArgument = false;
-  Candidate.TookAddressOfOverload = false;
   std::copy(ParamTys, ParamTys + Args.size(), Candidate.BuiltinParamTypes);
 
   // Determine the implicit conversion sequences for each of the

>From 8a77961280536b680c404a49002a00b988ca45fc Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan at arm.com>
Date: Wed, 24 Jul 2024 16:28:24 +0100
Subject: [PATCH 026/259] [Flang][Driver] Enable config file options (#100343)

Config files provide a facility to invoke the compiler with a predefined
set of options. This patch only enables these options in the flang driver;
the underlying functionality was already present.
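
For illustration (a minimal sketch mirroring the tests added below), a config
file is a plain-text list of driver options that is picked up via --config:

  $ cat config-1.cfg
  -flto
  $ flang --config config-1.cfg -S foo.f90

The driver then behaves as if the listed options had been passed on the
command line (foo.f90 is just an illustrative input file).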
---
 clang/include/clang/Driver/Options.td         | 10 +--
 flang/test/Driver/Inputs/config-1.cfg         |  1 +
 flang/test/Driver/Inputs/config-2.cfg         |  1 +
 flang/test/Driver/Inputs/config-2a.cfg        |  1 +
 flang/test/Driver/Inputs/config-6.cfg         |  1 +
 flang/test/Driver/Inputs/config/config-4.cfg  |  1 +
 flang/test/Driver/Inputs/config2/config-4.cfg |  1 +
 flang/test/Driver/config-file.f90             | 63 +++++++++++++++++++
 8 files changed, 74 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Driver/Inputs/config-1.cfg
 create mode 100644 flang/test/Driver/Inputs/config-2.cfg
 create mode 100644 flang/test/Driver/Inputs/config-2a.cfg
 create mode 100644 flang/test/Driver/Inputs/config-6.cfg
 create mode 100644 flang/test/Driver/Inputs/config/config-4.cfg
 create mode 100644 flang/test/Driver/Inputs/config2/config-4.cfg
 create mode 100644 flang/test/Driver/config-file.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 39ed8f8d232a7..28a5cb71d2219 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1165,19 +1165,19 @@ def client__name : JoinedOrSeparate<["-"], "client_name">;
 def combine : Flag<["-", "--"], "combine">, Flags<[NoXarchOption, Unsupported]>;
 def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
 def config : Joined<["--"], "config=">, Flags<[NoXarchOption]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>, MetaVarName<"<file>">,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, MetaVarName<"<file>">,
   HelpText<"Specify configuration file">;
-def : Separate<["--"], "config">, Alias<config>;
+def : Separate<["--"], "config">, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, Alias<config>;
 def no_default_config : Flag<["--"], "no-default-config">,
-  Flags<[NoXarchOption]>, Visibility<[ClangOption, CLOption, DXCOption]>,
+  Flags<[NoXarchOption]>, Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"Disable loading default configuration files">;
 def config_system_dir_EQ : Joined<["--"], "config-system-dir=">,
   Flags<[NoXarchOption, HelpHidden]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"System directory for configuration files">;
 def config_user_dir_EQ : Joined<["--"], "config-user-dir=">,
   Flags<[NoXarchOption, HelpHidden]>,
-  Visibility<[ClangOption, CLOption, DXCOption]>,
+  Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>,
   HelpText<"User directory for configuration files">;
 def coverage : Flag<["-", "--"], "coverage">, Group<Link_Group>,
   Visibility<[ClangOption, CLOption]>;
diff --git a/flang/test/Driver/Inputs/config-1.cfg b/flang/test/Driver/Inputs/config-1.cfg
new file mode 100644
index 0000000000000..824e128a42b63
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-1.cfg
@@ -0,0 +1 @@
+-flto
diff --git a/flang/test/Driver/Inputs/config-2.cfg b/flang/test/Driver/Inputs/config-2.cfg
new file mode 100644
index 0000000000000..4e8d01b668e83
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-2.cfg
@@ -0,0 +1 @@
+-fno-signed-zeros
diff --git a/flang/test/Driver/Inputs/config-2a.cfg b/flang/test/Driver/Inputs/config-2a.cfg
new file mode 100644
index 0000000000000..cd2916c98afe2
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-2a.cfg
@@ -0,0 +1 @@
+-fopenmp
diff --git a/flang/test/Driver/Inputs/config-6.cfg b/flang/test/Driver/Inputs/config-6.cfg
new file mode 100644
index 0000000000000..81e9830f63be4
--- /dev/null
+++ b/flang/test/Driver/Inputs/config-6.cfg
@@ -0,0 +1 @@
+-fstack-arrays
diff --git a/flang/test/Driver/Inputs/config/config-4.cfg b/flang/test/Driver/Inputs/config/config-4.cfg
new file mode 100644
index 0000000000000..d15a7108d4e21
--- /dev/null
+++ b/flang/test/Driver/Inputs/config/config-4.cfg
@@ -0,0 +1 @@
+-O3
diff --git a/flang/test/Driver/Inputs/config2/config-4.cfg b/flang/test/Driver/Inputs/config2/config-4.cfg
new file mode 100644
index 0000000000000..9d1c3e38c8680
--- /dev/null
+++ b/flang/test/Driver/Inputs/config2/config-4.cfg
@@ -0,0 +1 @@
+-ffp-contract=fast
diff --git a/flang/test/Driver/config-file.f90 b/flang/test/Driver/config-file.f90
new file mode 100644
index 0000000000000..70316dd971f36
--- /dev/null
+++ b/flang/test/Driver/config-file.f90
@@ -0,0 +1,63 @@
+!--- Config file (full path) in output of -###
+!
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-DIRS
+! CHECK-DIRS: System configuration file directory: {{.*}}/Inputs/config
+! CHECK-DIRS: User configuration file directory: {{.*}}/Inputs/config2
+!
+!--- Config file (full path) in output of -###
+!
+! RUN: %flang --config %S/Inputs/config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH
+! RUN: %flang --config=%S/Inputs/config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH
+! CHECK-HHH: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-HHH: -flto
+!
+!
+!--- Config file (full path) in output of -v
+!
+! RUN: %flang --config %S/Inputs/config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V
+! CHECK-V: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-V: -flto
+!
+!--- Config file in output of -###
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-HHH2
+! CHECK-HHH2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-HHH2: -flto
+!
+!--- Config file in output of -v
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-1.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-V2
+! CHECK-V2: Configuration file: {{.*}}Inputs{{.}}config-1.cfg
+! CHECK-V2: -flto
+!
+!--- Nested config files
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-2.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED
+! CHECK-NESTED: Configuration file: {{.*}}Inputs{{.}}config-2.cfg
+! CHECK-NESTED: -fno-signed-zeros
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir=%S/Inputs/config --config config-6.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2
+! CHECK-NESTED2: Configuration file: {{.*}}Inputs{{.}}config-6.cfg
+! CHECK-NESTED2: -fstack-arrays
+!
+!
+! RUN: %flang --config %S/Inputs/config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTEDa
+! CHECK-NESTEDa: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+! CHECK-NESTEDa: -fopenmp
+!
+! RUN: %flang --config-system-dir=%S/Inputs --config-user-dir= --config config-2a.cfg -S %s -### 2>&1 | FileCheck %s -check-prefix CHECK-NESTED2a
+! CHECK-NESTED2a: Configuration file: {{.*}}Inputs{{.}}config-2a.cfg
+! CHECK-NESTED2a: -fopenmp
+!
+!--- User directory is searched first.
+!
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir=%S/Inputs/config2 --config config-4.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-PRECEDENCE
+! CHECK-PRECEDENCE: Configuration file: {{.*}}Inputs{{.}}config2{{.}}config-4.cfg
+! CHECK-PRECEDENCE: -ffp-contract=fast
+!
+!--- Multiple configuration files can be specified.
+! RUN: %flang --config-system-dir=%S/Inputs/config --config-user-dir= --config config-4.cfg --config %S/Inputs/config2/config-4.cfg -S %s -o /dev/null -v 2>&1 | FileCheck %s -check-prefix CHECK-TWO-CONFIGS
+! CHECK-TWO-CONFIGS: Configuration file: {{.*}}Inputs{{.}}config{{.}}config-4.cfg
+! CHECK-TWO-CONFIGS-NEXT: Configuration file: {{.*}}Inputs{{.}}config2{{.}}config-4.cfg
+! CHECK-TWO-CONFIGS: -ffp-contract=fast
+! CHECK-TWO-CONFIGS: -O3

>From dd82a84e0eeafb017c7220c4a9fbd0a8a407f8a9 Mon Sep 17 00:00:00 2001
From: cor3ntin <corentinjabot at gmail.com>
Date: Wed, 24 Jul 2024 17:28:44 +0200
Subject: [PATCH 027/259] [Clang] Fix an assertion failure introduced by #93430
 (#100313)

PR #93430 introduced an assertion that did not make sense and caused a
regression. The fix is to simply remove the assertion.

No changelog entry; the intent is to backport this fix to Clang 19.
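
For illustration, the new test below exercises the kind of code that could
hit the removed assertion: taking the address of a reference to a free
function and calling through it.

  int bar(void) { return 55; }
  int (&fref)(void) = bar;
  void use() { (&fref)(); }  // fine: &fref yields a function pointer

(`use` is only an illustrative wrapper; the actual test calls `(&fref)()`
inside a member function of `C`.)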
---
 clang/lib/Sema/SemaExpr.cpp                | 1 -
 clang/test/SemaCXX/cxx2b-deducing-this.cpp | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 9207bf7a41349..206194930f3b4 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -5730,7 +5730,6 @@ static bool isParenthetizedAndQualifiedAddressOfExpr(Expr *Fn) {
   if (!UO || UO->getOpcode() != clang::UO_AddrOf)
     return false;
   if (auto *DRE = dyn_cast<DeclRefExpr>(UO->getSubExpr()->IgnoreParens())) {
-    assert(isa<FunctionDecl>(DRE->getDecl()) && "expected a function");
     return DRE->hasQualifier();
   }
   if (auto *OVL = dyn_cast<OverloadExpr>(UO->getSubExpr()->IgnoreParens()))
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 5cbc1f735383b..4811b6052254c 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -895,6 +895,10 @@ void g() {
 }
 
 namespace P2797 {
+
+int bar(void) { return 55; }
+int (&fref)(void) = bar;
+
 struct C {
   void c(this const C&);    // #first
   void c() &;               // #second
@@ -915,6 +919,8 @@ struct C {
     (&C::c)(C{});
     (&C::c)(*this);     // expected-error {{call to non-static member function without an object argument}}
     (&C::c)();
+
+    (&fref)();
   }
 };
 }

>From e1a3aa8c0fb0103427c45a5c861fc98aa44a4821 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 24 Jul 2024 16:32:24 +0100
Subject: [PATCH 028/259] LV/Legality: fix style after cursory reading (NFC)
 (#100363)

---
 .../Vectorize/LoopVectorizationLegality.cpp   | 86 +++++++++----------
 1 file changed, 39 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cafec165f6d6f..5babb1d4c1f96 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -47,7 +47,7 @@ cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                          cl::desc("Allow enabling loop hints to reorder "
                                   "FP operations during vectorization."));
-}
+} // namespace llvm
 
 // TODO: Move size-based thresholds out of legality checking, make cost based
 // decisions instead of hard thresholds.
@@ -216,20 +216,19 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
                                       TheLoop->getStartLoc(),
                                       TheLoop->getHeader())
              << "loop not vectorized: vectorization is explicitly disabled";
-    else {
-      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
-                                 TheLoop->getStartLoc(), TheLoop->getHeader());
-      R << "loop not vectorized";
-      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
-        R << " (Force=" << NV("Force", true);
-        if (Width.Value != 0)
-          R << ", Vector Width=" << NV("VectorWidth", getWidth());
-        if (getInterleave() != 0)
-          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
-        R << ")";
-      }
-      return R;
+
+    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
+                               TheLoop->getHeader());
+    R << "loop not vectorized";
+    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+      R << " (Force=" << NV("Force", true);
+      if (Width.Value != 0)
+        R << ", Vector Width=" << NV("VectorWidth", getWidth());
+      if (getInterleave() != 0)
+        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
+      R << ")";
     }
+    return R;
   });
 }
 
@@ -271,8 +270,8 @@ void LoopVectorizeHints::getHintsFromMetadata() {
       if (!MD || MD->getNumOperands() == 0)
         continue;
       S = dyn_cast<MDString>(MD->getOperand(0));
-      for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
-        Args.push_back(MD->getOperand(i));
+      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
+        Args.push_back(MD->getOperand(Idx));
     } else {
       S = dyn_cast<MDString>(MDO);
       assert(Args.size() == 0 && "too many arguments for MDString");
@@ -444,10 +443,7 @@ static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
     return true;
 
   // Otherwise compare address SCEVs
-  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
-    return true;
-
-  return false;
+  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
 }
 
 int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
@@ -734,26 +730,21 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
   BasicBlock *Header = TheLoop->getHeader();
 
   // Returns true if a given Phi is a supported induction.
-  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
     InductionDescriptor ID;
     if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
         ID.getKind() == InductionDescriptor::IK_IntInduction) {
       addInductionPhi(&Phi, ID, AllowedExit);
       return true;
-    } else {
-      // Bail out for any Phi in the outer loop header that is not a supported
-      // induction.
-      LLVM_DEBUG(
-          dbgs()
-          << "LV: Found unsupported PHI for outer loop vectorization.\n");
-      return false;
     }
+    // Bail out for any Phi in the outer loop header that is not a supported
+    // induction.
+    LLVM_DEBUG(
+        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
+    return false;
   };
 
-  if (llvm::all_of(Header->phis(), isSupportedPhi))
-    return true;
-  else
-    return false;
+  return llvm::all_of(Header->phis(), IsSupportedPhi);
 }
 
 /// Checks if a function is scalarizable according to the TLI, in
@@ -837,13 +828,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // historical vectorizer behavior after a generalization of the
         // IVDescriptor code.  The intent is to remove this check, but we
         // have to fix issues around code quality for such loops first.
-        auto isDisallowedStridedPointerInduction =
-          [](const InductionDescriptor &ID) {
-          if (AllowStridedPointerIVs)
-            return false;
-          return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
-            ID.getConstIntStepValue() == nullptr;
-        };
+        auto IsDisallowedStridedPointerInduction =
+            [](const InductionDescriptor &ID) {
+              if (AllowStridedPointerIVs)
+                return false;
+              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+                     ID.getConstIntStepValue() == nullptr;
+            };
 
         // TODO: Instead of recording the AllowedExit, it would be good to
         // record the complementary set: NotAllowedExit. These include (but may
@@ -861,7 +852,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // of these NotAllowedExit.
         InductionDescriptor ID;
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
-            !isDisallowedStridedPointerInduction(ID)) {
+            !IsDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           Requirements->addExactFPMathInst(ID.getExactFPMathInst());
           continue;
@@ -876,7 +867,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // As a last resort, coerce the PHI to a AddRec expression
         // and re-try classifying it a an induction PHI.
         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
-            !isDisallowedStridedPointerInduction(ID)) {
+            !IsDisallowedStridedPointerInduction(ID)) {
           addInductionPhi(Phi, ID, AllowedExit);
           continue;
         }
@@ -932,9 +923,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI) {
         auto *SE = PSE.getSE();
         Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
-        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
-          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
-            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
+          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
+            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
+                                     TheLoop)) {
               reportVectorizationFailure("Found unvectorizable intrinsic",
                   "intrinsic instruction cannot be vectorized",
                   "CantVectorizeIntrinsic", ORE, TheLoop, CI);
@@ -1035,14 +1027,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           "loop induction variable could not be identified",
           "NoInductionVariable", ORE, TheLoop);
       return false;
-    } else if (!WidestIndTy) {
+    }
+    if (!WidestIndTy) {
       reportVectorizationFailure("Did not find one integer induction var",
           "integer loop induction variable could not be identified",
           "NoIntegerInductionVariable", ORE, TheLoop);
       return false;
-    } else {
-      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     }
+    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
   }
 
   // Now we know the widest induction type, check if our found induction

>From 4e28d30e335e6e075f4624ae8d5782fef6d3f1ed Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 24 Jul 2024 16:32:49 +0100
Subject: [PATCH 029/259] LV/Legality: update outdated comment for isInvariant
 (NFC) (#100366)

---
 .../llvm/Transforms/Vectorize/LoopVectorizationLegality.h     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2ff17bd2f7a71..0f4d1355dd2bf 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -349,8 +349,8 @@ class LoopVectorizationLegality {
   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
 
-  /// Returns true if value V is uniform across \p VF lanes, when \p VF is
-  /// provided, and otherwise if \p V is invariant across all loop iterations.
+  /// Returns true if \p V is invariant across all loop iterations according to
+  /// SCEV.
   bool isInvariant(Value *V) const;
 
   /// Returns true if value V is uniform across \p VF lanes, when \p VF is

>From 0c03b4ce1081383b7649ac0f5ba3cbaffaf5bf28 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 24 Jul 2024 23:39:30 +0800
Subject: [PATCH 030/259] [InstCombine] Infer sub nuw from dominating
 conditions (#100164)

Alive2: https://alive2.llvm.org/ce/z/g3xxnM
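
For illustration, this mirrors the sub_infer_nuw_from_domcond test added
below: once the dominating branch establishes x uge y, the subtraction in the
taken block cannot wrap.

  %cond = icmp uge i32 %x, %y
  br i1 %cond, label %if.then, label %if.else
  if.then:
    %sub = sub i32 %x, %y   ; InstCombine can now infer: sub nuw i32 %x, %y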
---
 llvm/lib/Analysis/ValueTracking.cpp           |  17 +--
 .../Transforms/InstCombine/abs-intrinsic.ll   |   2 +-
 llvm/test/Transforms/InstCombine/sub.ll       | 136 ++++++++++++++++++
 3 files changed, 144 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 4b77c0046cc70..28eee9b59c043 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7110,16 +7110,13 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
     if (isGuaranteedNotToBeUndef(LHS, SQ.AC, SQ.CxtI, SQ.DT))
       return OverflowResult::NeverOverflows;
 
-  // Checking for conditions implied by dominating conditions may be expensive.
-  // Limit it to usub_with_overflow calls for now.
-  if (match(SQ.CxtI,
-            m_Intrinsic<Intrinsic::usub_with_overflow>(m_Value(), m_Value())))
-    if (auto C = isImpliedByDomCondition(CmpInst::ICMP_UGE, LHS, RHS, SQ.CxtI,
-                                         SQ.DL)) {
-      if (*C)
-        return OverflowResult::NeverOverflows;
-      return OverflowResult::AlwaysOverflowsLow;
-    }
+  if (auto C = isImpliedByDomCondition(CmpInst::ICMP_UGE, LHS, RHS, SQ.CxtI,
+                                       SQ.DL)) {
+    if (*C)
+      return OverflowResult::NeverOverflows;
+    return OverflowResult::AlwaysOverflowsLow;
+  }
+
   ConstantRange LHSRange =
       computeConstantRangeIncludingKnownBits(LHS, /*ForSigned=*/false, SQ);
   ConstantRange RHSRange =
diff --git a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
index b6740374c0791..f7c639f1d8e6b 100644
--- a/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
+++ b/llvm/test/Transforms/InstCombine/abs-intrinsic.ll
@@ -800,7 +800,7 @@ define i32 @sub_abs_wrong_pred(i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.abs.i32(i32 [[SUB]], i1 true)
 ; CHECK-NEXT:    br label [[COND_END]]
 ; CHECK:       cond.end:
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 49770e8d9e43f..cb308ab66b093 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -2661,3 +2661,139 @@ define i8 @sub_of_adds_2xc(i8 %x, i8 %y) {
   %r = sub i8 %xc, %yc
   ret i8 %r
 }
+
+define i32 @sub_infer_nuw_from_domcond(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}
+
+define i1 @sub_infer_nuw_from_domcond_fold1(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold1(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %ext0 = zext i32 %y to i64
+  %ext1 = zext i32 %x to i64
+  %sub = sub i32 %x, %y
+  %ext2 = zext i32 %sub to i64
+  %add = add nuw nsw i64 %ext2, %ext0
+  %cmp = icmp ugt i64 %add, %ext1
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
+define i64 @sub_infer_nuw_from_domcond_fold2(i32 range(i32 0, 2147483648) %x, i32 range(i32 0, 2147483648) %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold2(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[EXT:%.*]] = zext nneg i32 [[SUB]] to i64
+; CHECK-NEXT:    ret i64 [[EXT]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i64 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  %ext = zext i32 %sub to i64
+  ret i64 %ext
+
+if.else:
+  ret i64 0
+}
+
+define i1 @sub_infer_nuw_from_domcond_fold3(i16 %xx, i32 range(i32 0, 12) %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold3(
+; CHECK-NEXT:    [[X:%.*]] = zext i16 [[XX:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+  %x = zext i16 %xx to i32
+  %cond = icmp ult i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub nsw i32 %y, %x
+  %cmp = icmp eq i32 %sub, -1
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
+; negative tests
+
+define i32 @sub_infer_nuw_from_domcond_wrong_pred(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_wrong_pred(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp sge i32 %x, %y
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}
+
+define i32 @sub_infer_nuw_from_domcond_lhs_is_false(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_infer_nuw_from_domcond_lhs_is_false(
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[COND_NOT]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp uge i32 %x, %y
+  br i1 %cond, label %if.else, label %if.then
+
+if.then:
+  %sub = sub i32 %x, %y
+  ret i32 %sub
+
+if.else:
+  ret i32 0
+}

>From 5bae81ba9e6db16fac3cc0ee96b134f3f588c8f4 Mon Sep 17 00:00:00 2001
From: Krzysztof Pszeniczny <kpszeniczny at google.com>
Date: Wed, 24 Jul 2024 17:43:26 +0200
Subject: [PATCH 031/259] [CodeGen] Add an option to skip extTSP BB placement
 for huge functions. (#99310)

The extTSP-based basic block layout algorithm improves the performance
of the generated code, but unfortunately it has a super-linear time
complexity. This leads to extremely long compilation times for certain
relatively rare kinds of autogenerated code.

This patch adds an `-mllvm` flag to optionally restrict extTSP only to
functions smaller than a specified threshold. While commit
bcdc0477319a26fd8dcdde5ace3bdd6743599f44 added a knob to limit the
maximum chain size, it's still possible that for certain huge functions
the number of chains is very large, leading to a quadratic behaviour in
ExtTSPImpl::mergeChainPairs.
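
For reference, an illustrative invocation (using a hypothetical threshold of
5000 blocks; the default is effectively unlimited):

  llc -enable-ext-tsp-block-placement=1 \
      -ext-tsp-block-placement-max-blocks=5000 foo.ll

Functions with more than 5000 basic blocks then keep the default block
placement instead of running the ext-TSP algorithm.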
---
 llvm/lib/CodeGen/MachineBlockPlacement.cpp        |  9 ++++++++-
 .../CodeGen/X86/code_placement_ext_tsp_large.ll   | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index cb352a54ae222..6179e0938034a 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -213,6 +213,12 @@ static cl::opt<bool> RenumberBlocksBeforeView(
         "into a dot graph. Only used when a function is being printed."),
     cl::init(false), cl::Hidden);
 
+static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
+    "ext-tsp-block-placement-max-blocks",
+    cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
+             "block placement."),
+    cl::init(UINT_MAX), cl::Hidden);
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -3523,7 +3529,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
 
   // Apply a post-processing optimizing block placement.
   if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData())) {
+      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+      MF.size() <= ExtTspBlockPlacementMaxBlocks) {
     // Find a new placement and modify the layout of the blocks in the function.
     applyExtTsp();
 
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index 842aced4884f7..ac172d32c6d8b 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=0 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4
 
 @yydebug = dso_local global i32 0, align 4
 
@@ -110,6 +111,20 @@ define void @func_large() !prof !0 {
 ; CHECK3: b7
 ; CHECK3: b8
 ; CHECK3: b9
+;
+; An expected output with function size larger than the threshold -- the layout is not modified:
+;
+; CHECK4-LABEL: func_large:
+; CHECK4: b0
+; CHECK4: b1
+; CHECK4: b2
+; CHECK4: b3
+; CHECK4: b4
+; CHECK4: b5
+; CHECK4: b6
+; CHECK4: b7
+; CHECK4: b8
+; CHECK4: b9
 
 b0:
   %0 = load i32, ptr @yydebug, align 4

>From c5abcb0521dd81bfd8b736c5d03177cc3a2d0fe3 Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov <evgenii.kudriashov at intel.com>
Date: Wed, 24 Jul 2024 18:47:27 +0300
Subject: [PATCH 032/259] [X86][GlobalISel] Reorganize tests for G_PHI and
 support fp80 (#100047)

* Verified scalar support of G_PHI
* Added cases with i1, ptr and x86_fp80 types
* Added variability in the phi inputs

We don't enable tests for i686 due to the lack of float/integer type support
in the legalizer. We don't distinguish between an illegal 64-bit integer
G_LOAD and a legal double load using X87, which leads to artificial
G_MERGE_VALUES of floats into a single double.
---
 .../lib/Target/X86/GISel/X86LegalizerInfo.cpp |   3 +-
 llvm/test/CodeGen/X86/GlobalISel/phi.ll       | 172 ------
 llvm/test/CodeGen/X86/isel-phi.ll             | 494 ++++++++++++++++++
 3 files changed, 496 insertions(+), 173 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/GlobalISel/phi.ll
 create mode 100644 llvm/test/CodeGen/X86/isel-phi.ll

diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index dd8ecf6ef7fc7..7169d588548b0 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -315,7 +315,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   getActionDefinitionsBuilder(G_PHI)
       .legalIf([=](const LegalityQuery &Query) -> bool {
         return typeInSet(0, {s8, s16, s32, p0})(Query) ||
-               (Is64Bit && typeInSet(0, {s64})(Query)) ||
+               (UseX87 && typeIs(0, s80)(Query)) ||
+               (Is64Bit && typeIs(0, s64)(Query)) ||
                (HasSSE1 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query)) ||
                (HasAVX && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query)) ||
                (HasAVX512 &&
diff --git a/llvm/test/CodeGen/X86/GlobalISel/phi.ll b/llvm/test/CodeGen/X86/GlobalISel/phi.ll
deleted file mode 100644
index b29540f002598..0000000000000
--- a/llvm/test/CodeGen/X86/GlobalISel/phi.ll
+++ /dev/null
@@ -1,172 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
-
-define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
-; ALL-LABEL: test_i8:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB0_2
-; ALL-NEXT:  # %bb.1:
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    # kill: def $al killed $al killed $eax
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB0_2: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    # kill: def $al killed $al killed $eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i8 %cond
-}
-
-define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
-; ALL-LABEL: test_i16:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB1_2
-; ALL-NEXT:  # %bb.1:
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB1_2: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i16 %cond
-}
-
-define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
-; ALL-LABEL: test_i32:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB2_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB2_1: # %cond.false
-; ALL-NEXT:    movl %edx, %eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i32 %cond
-}
-
-define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
-; ALL-LABEL: test_i64:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %cl
-; ALL-NEXT:    testb $1, %cl
-; ALL-NEXT:    je .LBB3_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB3_1: # %cond.false
-; ALL-NEXT:    movq %rdx, %rax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
-  ret i64 %cond
-}
-
-define float @test_float(i32 %a, float %f, float %t) {
-; ALL-LABEL: test_float:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    je .LBB4_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB4_1: # %cond.false
-; ALL-NEXT:    movaps %xmm1, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
-  ret float %cond
-}
-
-define double @test_double(i32 %a, double %f, double %t) {
-; ALL-LABEL: test_double:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    cmpl $0, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    je .LBB5_1
-; ALL-NEXT:  # %bb.2: # %cond.end
-; ALL-NEXT:    retq
-; ALL-NEXT:  .LBB5_1: # %cond.false
-; ALL-NEXT:    movaps %xmm1, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
-  br i1 %cmp, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %entry
-  br label %cond.end
-
-cond.false:                                       ; preds = %entry
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
-  ret double %cond
-}
diff --git a/llvm/test/CodeGen/X86/isel-phi.ll b/llvm/test/CodeGen/X86/isel-phi.ll
new file mode 100644
index 0000000000000..412aeac5eed6a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-phi.ll
@@ -0,0 +1,494 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86
+; RUN: llc -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86
+; TODO: enable when x87 is supported
+; llc -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X86,GLOBAL-X86
+; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=X64,GLOBAL-X64
+
+define i1 @test_i1(i1 %a, i1 %b, i1 %c, i1 %pred0, i1 %pred1) {
+; X86-LABEL: test_i1:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2: # %cond
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_4: # %cond.false
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i1:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %cond
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    testb $1, %r8b
+; X64-NEXT:    jne .LBB0_4
+; X64-NEXT:  # %bb.3: # %cond.false
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:  .LBB0_4: # %cond.end
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred0, label %cond, label %cond.end
+
+cond:
+  br i1 %pred1, label %cond.true, label %cond.false
+
+cond.true:
+  br label %cond.end
+
+cond.false:
+  br label %cond.end
+
+cond.end:
+  %res = phi i1 [ %a, %entry ], [ %b, %cond.true ], [ %c, %cond.false ]
+  ret i1 %res
+}
+
+define i8 @test_i8(i8 %f, i8 %t, i1 %pred) {
+; X86-LABEL: test_i8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB1_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_2: # %cond.false
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    je .LBB1_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB1_2: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i8 %cond
+}
+
+define i16 @test_i16(i16 %f, i16 %t, i1 %pred) {
+; X86-LABEL: test_i16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB2_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_2: # %cond.false
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    je .LBB2_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB2_2: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i16 %cond
+}
+
+define i32 @test_i32(i32 %f, i32 %t, i1 %pred) {
+; X86-LABEL: test_i32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB3_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_2: # %cond.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    jne .LBB3_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:  .LBB3_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i32 %cond
+}
+
+define i64 @test_i64(i64 %f, i64 %t, i1 %pred) {
+; X86-LABEL: test_i64:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB4_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_2: # %cond.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_i64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    jne .LBB4_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:  .LBB4_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret i64 %cond
+}
+
+define float @test_float(float %f, float %t, i1 %pred) {
+; X86-LABEL: test_float:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB5_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB5_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_float:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    jne .LBB5_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:  .LBB5_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+  ret float %cond
+}
+
+define double @test_double(i32 %a, double %f, double %t, i1 %pred) {
+; X86-LABEL: test_double:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB6_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB6_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_double:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    testb $1, %sil
+; X64-NEXT:    jne .LBB6_2
+; X64-NEXT:  # %bb.1: # %cond.false
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:  .LBB6_2: # %cond.end
+; X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
+  ret double %cond
+}
+
+define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 %pred0, i1 %pred1, i1 %pred2) {
+; X86-LABEL: test_ptr:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    je .LBB7_6
+; X86-NEXT:  # %bb.1: # %cond.true
+; X86-NEXT:    testb $1, %cl
+; X86-NEXT:    je .LBB7_3
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_6: # %cond.false
+; X86-NEXT:    testb $1, %cl
+; X86-NEXT:    je .LBB7_10
+; X86-NEXT:  # %bb.7: # %cond.false.true
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_9
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_3: # %cond.true.false
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_5
+; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_10: # %cond.false.false
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    je .LBB7_12
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_9: # %cond.false.true.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_5: # %cond.true.false.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_12: # %cond.false.false.false
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; DAG-X64-LABEL: test_ptr:
+; DAG-X64:       # %bb.0: # %entry
+; DAG-X64-NEXT:    movq %rdi, %rax
+; DAG-X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; DAG-X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; DAG-X64-NEXT:    testb $1, {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    je .LBB7_4
+; DAG-X64-NEXT:  # %bb.1: # %cond.true
+; DAG-X64-NEXT:    testb $1, %r10b
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.2: # %cond.true.false
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %rsi, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.3: # %cond.true.false.false
+; DAG-X64-NEXT:    movq %rdx, %rax
+; DAG-X64-NEXT:    retq
+; DAG-X64-NEXT:  .LBB7_4: # %cond.false
+; DAG-X64-NEXT:    testb $1, %r10b
+; DAG-X64-NEXT:    je .LBB7_7
+; DAG-X64-NEXT:  # %bb.5: # %cond.false.true
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %rcx, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.6: # %cond.false.true.false
+; DAG-X64-NEXT:    movq %r8, %rax
+; DAG-X64-NEXT:    retq
+; DAG-X64-NEXT:  .LBB7_7: # %cond.false.false
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    movq %r9, %rax
+; DAG-X64-NEXT:    jne .LBB7_9
+; DAG-X64-NEXT:  # %bb.8: # %cond.false.false.false
+; DAG-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; DAG-X64-NEXT:  .LBB7_9: # %cond.end
+; DAG-X64-NEXT:    retq
+;
+; GLOBAL-X64-LABEL: test_ptr:
+; GLOBAL-X64:       # %bb.0: # %entry
+; GLOBAL-X64-NEXT:    movq %rdi, %rax
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; GLOBAL-X64-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; GLOBAL-X64-NEXT:    testb $1, %r11b
+; GLOBAL-X64-NEXT:    je .LBB7_4
+; GLOBAL-X64-NEXT:  # %bb.1: # %cond.true
+; GLOBAL-X64-NEXT:    testb $1, %r10b
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.2: # %cond.true.false
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %rsi, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.3: # %cond.true.false.false
+; GLOBAL-X64-NEXT:    movq %rdx, %rax
+; GLOBAL-X64-NEXT:    retq
+; GLOBAL-X64-NEXT:  .LBB7_4: # %cond.false
+; GLOBAL-X64-NEXT:    testb $1, %r10b
+; GLOBAL-X64-NEXT:    je .LBB7_7
+; GLOBAL-X64-NEXT:  # %bb.5: # %cond.false.true
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %rcx, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.6: # %cond.false.true.false
+; GLOBAL-X64-NEXT:    movq %r8, %rax
+; GLOBAL-X64-NEXT:    retq
+; GLOBAL-X64-NEXT:  .LBB7_7: # %cond.false.false
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    movq %r9, %rax
+; GLOBAL-X64-NEXT:    jne .LBB7_9
+; GLOBAL-X64-NEXT:  # %bb.8: # %cond.false.false.false
+; GLOBAL-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; GLOBAL-X64-NEXT:  .LBB7_9: # %cond.end
+; GLOBAL-X64-NEXT:    retq
+entry:
+  br i1 %pred0, label %cond.true, label %cond.false
+
+cond.true:
+  br i1 %pred1, label %cond.end, label %cond.true.false
+
+cond.true.false:
+  br i1 %pred2, label %cond.true.false.true, label %cond.true.false.false
+
+cond.true.false.true:
+  br label %cond.end
+
+cond.true.false.false:
+  br label %cond.end
+
+cond.false:
+  br i1 %pred1, label %cond.false.true, label %cond.false.false
+
+cond.false.true:
+  br i1 %pred2, label %cond.false.true.true, label %cond.false.true.false
+
+cond.false.true.true:
+  br label %cond.end
+
+cond.false.true.false:
+  br label %cond.end
+
+cond.false.false:
+  br i1 %pred2, label %cond.false.false.true, label %cond.false.false.false
+
+cond.false.false.true:
+  br label %cond.end
+
+cond.false.false.false:
+  br label %cond.end
+
+cond.end:
+  %res = phi ptr [ %a, %cond.true ], [ %b, %cond.true.false.true ], [ %c, %cond.true.false.false ], [ %d, %cond.false.true.true ], [ %e, %cond.false.true.false ], [ %f, %cond.false.false.true ], [ %g, %cond.false.false.false ]
+  ret ptr %res
+}
+
+define x86_fp80 @test_fp80(x86_fp80 %f, x86_fp80 %t, i1 %pred) {
+; X86-LABEL: test_fp80:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    jne .LBB8_2
+; X86-NEXT:  # %bb.1: # %cond.false
+; X86-NEXT:    fstp %st(0)
+; X86-NEXT:    fldz
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:  .LBB8_2: # %cond.end
+; X86-NEXT:    fstp %st(1)
+; X86-NEXT:    retl
+;
+; DAG-X64-LABEL: test_fp80:
+; DAG-X64:       # %bb.0: # %entry
+; DAG-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; DAG-X64-NEXT:    testb $1, %dil
+; DAG-X64-NEXT:    jne .LBB8_2
+; DAG-X64-NEXT:  # %bb.1: # %cond.false
+; DAG-X64-NEXT:    fstp %st(0)
+; DAG-X64-NEXT:    fldz
+; DAG-X64-NEXT:    fxch %st(1)
+; DAG-X64-NEXT:  .LBB8_2: # %cond.end
+; DAG-X64-NEXT:    fstp %st(1)
+; DAG-X64-NEXT:    retq
+;
+; GLOBAL-X64-LABEL: test_fp80:
+; GLOBAL-X64:       # %bb.0: # %entry
+; GLOBAL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GLOBAL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GLOBAL-X64-NEXT:    testb $1, %dil
+; GLOBAL-X64-NEXT:    jne .LBB8_2
+; GLOBAL-X64-NEXT:  # %bb.1: # %cond.false
+; GLOBAL-X64-NEXT:    fstp %st(1)
+; GLOBAL-X64-NEXT:    fldz
+; GLOBAL-X64-NEXT:  .LBB8_2: # %cond.end
+; GLOBAL-X64-NEXT:    fstp %st(0)
+; GLOBAL-X64-NEXT:    retq
+entry:
+  br i1 %pred, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi x86_fp80 [ %f, %cond.true ], [ %t, %cond.false ]
+  ret x86_fp80 %cond
+}

>From 2754c083cba1798b335947ad5e4f0d654b013904 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 24 Jul 2024 16:52:11 +0100
Subject: [PATCH 033/259] LAA: mark LoopInfo pointer const (NFC) (#100373)

---
 llvm/include/llvm/Analysis/ValueTracking.h | 2 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp   | 3 ++-
 llvm/lib/Analysis/ValueTracking.cpp        | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 8e5a8dc483d27..5ef6e43483906 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -770,7 +770,7 @@ const Value *getUnderlyingObjectAggressive(const Value *V);
 /// it shouldn't look through the phi above.
 void getUnderlyingObjects(const Value *V,
                           SmallVectorImpl<const Value *> &Objects,
-                          LoopInfo *LI = nullptr, unsigned MaxLookup = 6);
+                          const LoopInfo *LI = nullptr, unsigned MaxLookup = 6);
 
 /// This is a wrapper around getUnderlyingObjects and adds support for basic
 /// ptrtoint+arithmetic+inttoptr sequences.
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 84214c47a10e1..37320720c5329 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -786,7 +786,8 @@ class AccessAnalysis {
   //intrinsic property (such as TBAA metadata).
   AliasSetTracker AST;
 
-  LoopInfo *LI;
+  /// The LoopInfo of the loop being checked.
+  const LoopInfo *LI;
 
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 28eee9b59c043..bfd26fadd237b 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6580,7 +6580,7 @@ const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) {
 
 void llvm::getUnderlyingObjects(const Value *V,
                                 SmallVectorImpl<const Value *> &Objects,
-                                LoopInfo *LI, unsigned MaxLookup) {
+                                const LoopInfo *LI, unsigned MaxLookup) {
   SmallPtrSet<const Value *, 4> Visited;
   SmallVector<const Value *, 4> Worklist;
   Worklist.push_back(V);

>From 8c20d715c0989eb3a9e253a53ce9d25ad6e5ee10 Mon Sep 17 00:00:00 2001
From: Dmitry Chernenkov <dmitryc at google.com>
Date: Wed, 24 Jul 2024 15:55:57 +0000
Subject: [PATCH 034/259] [Bazel][mlir] Fix Bazel for
 f83950ab8dfda1da882a6ef7b508639df251621a

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel      | 2 ++
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 +
 2 files changed, 3 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 0d0224e6d6802..536c8256b4274 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7158,6 +7158,7 @@ cc_library(
         ":DialectUtils",
         ":FuncDialect",
         ":IR",
+        ":Pass",
         ":SPIRVDialect",
         ":Support",
         ":TransformUtils",
@@ -8319,6 +8320,7 @@ cc_library(
         ":TransformUtils",
         ":Transforms",
         ":UBToSPIRV",
+        ":VectorDialect",
         ":VectorToSPIRV",
         ":VectorTransforms",
     ],
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index a1d2b20a106e6..34beb758a12dd 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -668,6 +668,7 @@ cc_library(
         "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
+        "//mlir:VectorTransforms",
     ],
 )
 

>From af611a0a98fec65ab86af4208ca74e01541baba1 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 11:00:16 -0500
Subject: [PATCH 035/259] [LinkerWrapper] Fix `-Xoffload-linker a=b` in
 offloading (#100270)

Summary:
We have the `-Xoffload-linker=triple=arg` syntax that splits off an argument
meant only for a single toolchain. However, this broke if the argument itself
was an `a=b` style option. Only treat the prefix as a triple if it parses as
a valid triple.
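
As a rough illustration of the new behavior (a minimal sketch, not the
wrapper's exact code; the helper name is made up), the text before the first
`=` is only honored as a per-target filter when it parses to a known
architecture:

```cpp
// Hedged sketch: decide whether the text before the first '=' names a target
// triple or is just part of an ordinary `a=b` linker option.
#include "llvm/ADT/StringRef.h"
#include "llvm/TargetParser/Triple.h"

static bool isPerTargetFilter(llvm::StringRef Arg) {
  auto [Prefix, Value] = Arg.split('=');
  llvm::Triple TT(Prefix);
  // An unrecognized prefix parses to UnknownArch, so an option like "foo=bar"
  // is forwarded to every device link job instead of being dropped.
  return !Value.empty() && TT.getArch() != llvm::Triple::UnknownArch;
}
```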
---
 clang/test/Driver/linker-wrapper.c                      | 7 ++++---
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 8 +++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 63d43921be9ac..342907c1a3390 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -128,11 +128,12 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b \
+// RUN:   --linker-path=/usr/bin/ld --device-linker=foo=bar --device-linker=a \
+// RUN:   --device-linker=nvptx64-nvidia-cuda=b \
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER-ARGS
 
-// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}a
-// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}a b
+// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}foo=bar{{.*}}a
+// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}foo=bar{{.*}}a b
 
 // RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -ldummy \
 // RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 9a38e4fb098f9..4bb021eae25a8 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1214,7 +1214,13 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
   // Forward '-Xoffload-linker' options to the appropriate backend.
   for (StringRef Arg : Args.getAllArgValues(OPT_device_linker_args_EQ)) {
     auto [Triple, Value] = Arg.split('=');
-    if (Value.empty())
+    llvm::Triple TT(Triple);
+    // If this isn't a recognized triple then it's an `arg=value` option.
+    if (TT.getArch() <= Triple::ArchType::UnknownArch ||
+        TT.getArch() > Triple::ArchType::LastArchType)
+      DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_linker_arg_EQ),
+                       Args.MakeArgString(Arg));
+    else if (Value.empty())
       DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_linker_arg_EQ),
                        Args.MakeArgString(Triple));
     else if (Triple == DAL.getLastArgValue(OPT_triple_EQ))

>From d85da4af4983fb8997865014ca5f87ad2db5e272 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 24 Jul 2024 16:59:17 +0100
Subject: [PATCH 036/259] [X86] canCreateUndefOrPoisonForTargetNode -
 PMADDWD/PMADDUBSW intrinsics don't create poison

Fix regression introduced by #84924
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++
 llvm/test/CodeGen/X86/combine-pmadd.ll  | 47 ++++---------------------
 2 files changed, 16 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 14a6a91262abc..7f2c98db83304 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43328,6 +43328,16 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
     return false;
+  case ISD::INTRINSIC_WO_CHAIN:
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_pmadd_wd:
+    case Intrinsic::x86_avx2_pmadd_wd:
+    case Intrinsic::x86_avx512_pmaddw_d_512:
+    case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+    case Intrinsic::x86_avx2_pmadd_ub_sw:
+    case Intrinsic::x86_avx512_pmaddubs_w_512:
+      return false;
+    }
   }
   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll
index d011efa5b6140..eb627f13db08f 100644
--- a/llvm/test/CodeGen/X86/combine-pmadd.ll
+++ b/llvm/test/CodeGen/X86/combine-pmadd.ll
@@ -82,10 +82,8 @@ define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
 ; AVX2-LABEL: combine_pmaddwd_concat_freeze:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm1
-; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    retq
   %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -229,10 +227,8 @@ define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1)
 ; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm1
-; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    retq
   %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
@@ -293,41 +289,10 @@ define i32 @combine_pmaddubsw_constant_sat() {
 
 ; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values.
 define i1 @pmaddwd_pcmpgt_infinite_loop() {
-; SSE-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $1, %xmm1
-; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    testl %eax, %eax
-; SSE-NEXT:    sete %al
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vtestps %xmm1, %xmm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vtestps %xmm1, %xmm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    retq
+; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    retq
   %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>)
   %2 = icmp eq <4 x i32> %1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
   %3 = select <4 x i1> %2, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> zeroinitializer

>From 073c199ae79f4bf9a89fcbf103cbdd6333c8512b Mon Sep 17 00:00:00 2001
From: Krystian Stasiowski <sdkrystian at gmail.com>
Date: Wed, 24 Jul 2024 12:02:45 -0400
Subject: [PATCH 037/259] [Clang][AST] Don't use canonical type when checking
 dependence in Type::isOverloadable (#98563)

Fixes #97646.
---
 clang/docs/ReleaseNotes.rst                          | 2 ++
 clang/include/clang/AST/Type.h                       | 2 +-
 clang/test/SemaCXX/decltype.cpp                      | 8 ++++++++
 clang/test/SemaCXX/{typeof_unqual.cpp => typeof.cpp} | 8 ++++++++
 4 files changed, 19 insertions(+), 1 deletion(-)
 rename clang/test/SemaCXX/{typeof_unqual.cpp => typeof.cpp} (68%)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c80c2ebbac9a7..65de90f69e198 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -147,6 +147,8 @@ Bug Fixes to Attribute Support
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Fixed a crash when an expression with a dependent ``__typeof__`` type is used as the operand of a unary operator. (#GH97646)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 25defea58c2dc..72723c7c56e07 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -8434,7 +8434,7 @@ inline bool Type::isUndeducedType() const {
 /// Determines whether this is a type for which one can define
 /// an overloaded operator.
 inline bool Type::isOverloadableType() const {
-  if (!CanonicalType->isDependentType())
+  if (!isDependentType())
     return isRecordType() || isEnumeralType();
   return !isArrayType() && !isFunctionType() && !isAnyPointerType() &&
          !isMemberPointerType();
diff --git a/clang/test/SemaCXX/decltype.cpp b/clang/test/SemaCXX/decltype.cpp
index 96abb60836e40..9961c5e2bd918 100644
--- a/clang/test/SemaCXX/decltype.cpp
+++ b/clang/test/SemaCXX/decltype.cpp
@@ -139,6 +139,14 @@ namespace GH58674 {
   }
 }
 
+namespace GH97646 {
+  template<bool B>
+  void f() {
+    decltype(B) x = false;
+    !x;
+  }
+}
+
 template<typename>
 class conditional {
 };
diff --git a/clang/test/SemaCXX/typeof_unqual.cpp b/clang/test/SemaCXX/typeof.cpp
similarity index 68%
rename from clang/test/SemaCXX/typeof_unqual.cpp
rename to clang/test/SemaCXX/typeof.cpp
index 335e57995377c..421cfc59f5311 100644
--- a/clang/test/SemaCXX/typeof_unqual.cpp
+++ b/clang/test/SemaCXX/typeof.cpp
@@ -3,3 +3,11 @@
 typeof_unqual(int) u = 12; // expected-error {{expected function body after function declarator}}
 __typeof_unqual(int) _u = 12;
 __typeof_unqual__(int) __u = 12;
+
+namespace GH97646 {
+  template<bool B>
+  void f() {
+    __typeof__(B) x = false;
+    !x;
+  }
+}

>From 046a17717d9c5b5385ecd914621b48bdd91524d0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 24 Jul 2024 11:03:52 -0500
Subject: [PATCH 038/259] [libc++] Improve behavior when using relative path
 for LIBCXX_ASSERTION_HANDLER_FILE (#100157)

Fixes #80696
---
 libcxx/CMakeLists.txt          | 8 ++++++--
 libcxx/docs/BuildingLibcxx.rst | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 332816b15260a..674082c7d1787 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -71,12 +71,16 @@ if (NOT "${LIBCXX_HARDENING_MODE}" IN_LIST LIBCXX_SUPPORTED_HARDENING_MODES)
     "Unsupported hardening mode: '${LIBCXX_HARDENING_MODE}'. Supported values are ${LIBCXX_SUPPORTED_HARDENING_MODES}.")
 endif()
 set(LIBCXX_ASSERTION_HANDLER_FILE
-  "${CMAKE_CURRENT_SOURCE_DIR}/vendor/llvm/default_assertion_handler.in"
+  "vendor/llvm/default_assertion_handler.in"
   CACHE STRING
   "Specify the path to a header that contains a custom implementation of the
    assertion handler that gets invoked when a hardening assertion fails. If
    provided, this header will be included by the library, replacing the
-   default assertion handler.")
+   default assertion handler. If this is specified as a relative path, it
+   is assumed to be relative to '<monorepo>/libcxx'.")
+if (NOT IS_ABSOLUTE "${LIBCXX_ASSERTION_HANDLER_FILE}")
+  set(LIBCXX_ASSERTION_HANDLER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${LIBCXX_ASSERTION_HANDLER_FILE}")
+endif()
 option(LIBCXX_ENABLE_RANDOM_DEVICE
   "Whether to include support for std::random_device in the library. Disabling
    this can be useful when building the library for platforms that don't have
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 66bb19bb5b2cd..5c224689e0f9f 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -406,7 +406,8 @@ libc++ Feature Options
   Specify the path to a header that contains a custom implementation of the
   assertion handler that gets invoked when a hardening assertion fails. If
   provided, this header will be included by the library, replacing the
-  default assertion handler.
+  default assertion handler. If this is specified as a relative path, it
+  is assumed to be relative to ``<monorepo>/libcxx``.
 
 
 libc++ ABI Feature Options

>From 5898a7f4954e13cabc694a44e293d9040a09b9f4 Mon Sep 17 00:00:00 2001
From: Angel Zhang <angel.zhang at amd.com>
Date: Wed, 24 Jul 2024 12:08:39 -0400
Subject: [PATCH 039/259] [mlir][spirv] Update dependencies for #100138
 (#100397)

This PR updates CMake and Bazel dependencies for #100138.
---
 mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt | 1 +
 mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
index f7b090acf33af..c9d962d2de23f 100644
--- a/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
+++ b/mlir/lib/Conversion/ConvertToSPIRV/CMakeLists.txt
@@ -27,6 +27,7 @@ add_mlir_conversion_library(MLIRConvertToSPIRVPass
   MLIRTransforms
   MLIRTransformUtils
   MLIRUBToSPIRV
+  MLIRVectorDialect
   MLIRVectorToSPIRV
   MLIRVectorTransforms
   )
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
index 4de9b4729e720..68e0206e30a59 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_dialect_library(MLIRSPIRVConversion
   MLIRDialectUtils
   MLIRFuncDialect
   MLIRIR
+  MLIRPass
   MLIRSPIRVDialect
   MLIRSupport
   MLIRTransformUtils

>From 5a53add85a6d5be2d15eea32a3d06fec35e2c699 Mon Sep 17 00:00:00 2001
From: Finlay <finlay.marno at codeplay.com>
Date: Wed, 24 Jul 2024 17:30:03 +0100
Subject: [PATCH 040/259] [mlir] Add optimization attrs for gpu-to-llvmspv
 function declarations and calls (#99301)

Adds the attributes nounwind and willreturn to all function
declarations, and the `memory(none)` equivalent to the id/dimension
function declarations. The function declaration attributes are copied to
the function calls.
`nounwind` is legal because there are no exceptions in SPIR-V, and I do
not see any reason why any of these functions would not return when used
correctly.
I'm confident that the get id/dim functions have no externally
observable memory effects, but the convergent functions may have
effects.
---
 .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp  |  36 ++-
 .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir         | 286 ++++++++++++++----
 2 files changed, 255 insertions(+), 67 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index ebeb8f803d71d..27d63b5f8948d 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -43,8 +43,8 @@ namespace mlir {
 static LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable,
                                               StringRef name,
                                               ArrayRef<Type> paramTypes,
-                                              Type resultType,
-                                              bool isConvergent = false) {
+                                              Type resultType, bool isMemNone,
+                                              bool isConvergent) {
   auto func = dyn_cast_or_null<LLVM::LLVMFuncOp>(
       SymbolTable::lookupSymbolIn(symbolTable, name));
   if (!func) {
@@ -53,6 +53,18 @@ static LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable,
         symbolTable->getLoc(), name,
         LLVM::LLVMFunctionType::get(resultType, paramTypes));
     func.setCConv(LLVM::cconv::CConv::SPIR_FUNC);
+    func.setNoUnwind(true);
+    func.setWillReturn(true);
+
+    if (isMemNone) {
+      // no externally observable effects
+      constexpr auto noModRef = mlir::LLVM::ModRefInfo::NoModRef;
+      auto memAttr = b.getAttr<LLVM::MemoryEffectsAttr>(
+          /*other=*/noModRef,
+          /*argMem=*/noModRef, /*inaccessibleMem=*/noModRef);
+      func.setMemoryEffectsAttr(memAttr);
+    }
+
     func.setConvergent(isConvergent);
   }
   return func;
@@ -64,6 +76,10 @@ static LLVM::CallOp createSPIRVBuiltinCall(Location loc,
                                            ValueRange args) {
   auto call = rewriter.create<LLVM::CallOp>(loc, func, args);
   call.setCConv(func.getCConv());
+  call.setConvergentAttr(func.getConvergentAttr());
+  call.setNoUnwindAttr(func.getNoUnwindAttr());
+  call.setWillReturnAttr(func.getWillReturnAttr());
+  call.setMemoryEffectsAttr(func.getMemoryEffectsAttr());
   return call;
 }
 
@@ -91,8 +107,9 @@ struct GPUBarrierConversion final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
     assert(moduleOp && "Expecting module");
     Type flagTy = rewriter.getI32Type();
     Type voidTy = rewriter.getType<LLVM::LLVMVoidType>();
-    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(
-        moduleOp, funcName, flagTy, voidTy, /*isConvergent=*/true);
+    LLVM::LLVMFuncOp func =
+        lookupOrCreateSPIRVFn(moduleOp, funcName, flagTy, voidTy,
+                              /*isMemNone=*/false, /*isConvergent=*/true);
 
     // Value used by SPIR-V backend to represent `CLK_LOCAL_MEM_FENCE`.
     // See `llvm/lib/Target/SPIRV/SPIRVBuiltins.td`.
@@ -134,8 +151,9 @@ struct LaunchConfigConversion : ConvertToLLVMPattern {
     assert(moduleOp && "Expecting module");
     Type dimTy = rewriter.getI32Type();
     Type indexTy = getTypeConverter()->getIndexType();
-    LLVM::LLVMFuncOp func =
-        lookupOrCreateSPIRVFn(moduleOp, funcName, dimTy, indexTy);
+    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(moduleOp, funcName, dimTy,
+                                                  indexTy, /*isMemNone=*/true,
+                                                  /*isConvergent=*/false);
 
     Location loc = op->getLoc();
     gpu::Dimension dim = getDimension(op);
@@ -268,9 +286,9 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     Type valueType = adaptor.getValue().getType();
     Type offsetType = adaptor.getOffset().getType();
     Type resultType = valueType;
-    LLVM::LLVMFuncOp func =
-        lookupOrCreateSPIRVFn(moduleOp, funcName, {valueType, offsetType},
-                              resultType, /*isConvergent=*/true);
+    LLVM::LLVMFuncOp func = lookupOrCreateSPIRVFn(
+        moduleOp, funcName, {valueType, offsetType}, resultType,
+        /*isMemNone=*/false, /*isConvergent=*/true);
 
     Location loc = op->getLoc();
     std::array<Value, 2> args{adaptor.getValue(), adaptor.getOffset()};
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index 1b0f89a9a573e..bd7e5d139b001 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -4,30 +4,70 @@
 // RUN: | FileCheck --check-prefixes=CHECK-32,CHECK %s
 
 gpu.module @builtins {
-  // CHECK-64:    llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z12get_local_idj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z13get_global_idj(i32) -> i64
-  // CHECK-64:    llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64
-  // CHECK-32:    llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z12get_local_idj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z13get_global_idj(i32) -> i32
-  // CHECK-32:    llvm.func spir_funccc @_Z12get_group_idj(i32) -> i32
+  // CHECK-64:        llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z12get_local_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z12get_local_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z14get_local_sizej(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z13get_global_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z13get_global_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
+  // CHECK-64:        llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64 attributes {
+  // CHECK-32:        llvm.func spir_funccc @_Z12get_group_idj(i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       convergent
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_block_id
   func.func @gpu_block_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_x = gpu.block_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_y = gpu.block_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_group_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_group_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_group_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_id_z = gpu.block_id z
     return %block_id_x, %block_id_y, %block_id_z : index, index, index
   }
@@ -35,16 +75,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_global_id
   func.func @gpu_global_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_x = gpu.global_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_y = gpu.global_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z13get_global_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z13get_global_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z13get_global_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %global_id_z = gpu.global_id z
     return %global_id_x, %global_id_y, %global_id_z : index, index, index
   }
@@ -52,16 +107,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_block_dim
   func.func @gpu_block_dim() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_x = gpu.block_dim x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_y = gpu.block_dim y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_local_sizej([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %block_dim_z = gpu.block_dim z
     return %block_dim_x, %block_dim_y, %block_dim_z : index, index, index
   }
@@ -69,16 +139,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_thread_id
   func.func @gpu_thread_id() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_x = gpu.thread_id x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_y = gpu.thread_id y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z12get_local_idj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z12get_local_idj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z12get_local_idj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %thread_id_z = gpu.thread_id z
     return %thread_id_x, %thread_id_y, %thread_id_z : index, index, index
   }
@@ -86,16 +171,31 @@ gpu.module @builtins {
   // CHECK-LABEL: gpu_grid_dim
   func.func @gpu_grid_dim() -> (index, index, index) {
     // CHECK:         [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C0]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_x = gpu.grid_dim x
     // CHECK:         [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C1]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_y = gpu.grid_dim y
     // CHECK:         [[C2:%.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-64:      llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) : (i32) -> i64
-    // CHECK-32:      llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) : (i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z14get_num_groupsj([[C2]]) {
+    // CHECK-SAME-DAG:  memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       convergent
+    // CHECK-64-SAME: } : (i32) -> i64
+    // CHECK-32-SAME: } : (i32) -> i32
     %grid_dim_z = gpu.grid_dim z
     return %grid_dim_x, %grid_dim_y, %grid_dim_z : index, index, index
   }
@@ -104,12 +204,22 @@ gpu.module @builtins {
 // -----
 
 gpu.module @barriers {
-  // CHECK:       llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z7barrierj(i32) attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_barrier
   func.func @gpu_barrier() {
     // CHECK:         [[FLAGS:%.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) : (i32) -> ()
+    // CHECK:         llvm.call spir_funccc @_Z7barrierj([[FLAGS]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i32) -> ()
     gpu.barrier
     return
   }
@@ -120,10 +230,30 @@ gpu.module @barriers {
 // Check `gpu.shuffle` conversion with default subgroup size.
 
 gpu.module @shuffles {
-  // CHECK:       llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_shuffles
   // CHECK-SAME:              (%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: i32, %[[VAL_6:.*]]: f64, %[[VAL_7:.*]]: i32)
@@ -132,13 +262,33 @@ gpu.module @shuffles {
                           %val2: f32, %delta_up: i32,
                           %val3: f64, %delta_down: i32) {
     %width = arith.constant 32 : i32
-    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) : (i32, i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i32, i32) -> i32
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) : (i64, i32) -> i64
+    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects = #llvm.memory_effects
+    // CHECK-SAME:    } : (i64, i32) -> i64
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) : (f32, i32) -> f32
+    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects= #llvm.memory_effects
+    // CHECK-SAME:    } : (f32, i32) -> f32
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) : (f64, i32) -> f64
+    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) {
+    // CHECK-SAME-DAG:  no_unwind
+    // CHECK-SAME-DAG:  convergent
+    // CHECK-SAME-DAG:  will_return
+    // CHECK-NOT:       memory_effects= #llvm.memory_effects
+    // CHECK-SAME:    } : (f64, i32) -> f64
     // CHECK:         llvm.mlir.constant(true) : i1
     %shuffleResult0, %valid0 = gpu.shuffle idx %val0, %id, %width : i32
     %shuffleResult1, %valid1 = gpu.shuffle xor %val1, %mask, %width : i64
@@ -155,10 +305,30 @@ gpu.module @shuffles {
 gpu.module @shuffles attributes {
   spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Kernel, Addresses, GroupNonUniformShuffle, Int64], []>, #spirv.resource_limits<subgroup_size = 16>>
 } {
-  // CHECK:       llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {convergent}
-  // CHECK:       llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {convergent}
+  // CHECK:           llvm.func spir_funccc @_Z22sub_group_shuffle_downdj(f64, i32) -> f64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z20sub_group_shuffle_upfj(f32, i32) -> f32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z21sub_group_shuffle_xorlj(i64, i32) -> i64 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
+  // CHECK:           llvm.func spir_funccc @_Z17sub_group_shuffleij(i32, i32) -> i32 attributes {
+  // CHECK-SAME-DAG:  no_unwind
+  // CHECK-SAME-DAG:  convergent
+  // CHECK-SAME-DAG:  will_return
+  // CHECK-NOT:       memory_effects = #llvm.memory_effects
+  // CHECK-SAME:      }
 
   // CHECK-LABEL: gpu_shuffles
   // CHECK-SAME:              (%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: i32, %[[VAL_6:.*]]: f64, %[[VAL_7:.*]]: i32)
@@ -167,13 +337,13 @@ gpu.module @shuffles attributes {
                           %val2: f32, %delta_up: i32,
                           %val3: f64, %delta_down: i32) {
     %width = arith.constant 16 : i32
-    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]]) : (i32, i32) -> i32
+    // CHECK:         llvm.call spir_funccc @_Z17sub_group_shuffleij(%[[VAL_0]], %[[VAL_1]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]]) : (i64, i32) -> i64
+    // CHECK:         llvm.call spir_funccc @_Z21sub_group_shuffle_xorlj(%[[VAL_2]], %[[VAL_3]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]]) : (f32, i32) -> f32
+    // CHECK:         llvm.call spir_funccc @_Z20sub_group_shuffle_upfj(%[[VAL_4]], %[[VAL_5]])
     // CHECK:         llvm.mlir.constant(true) : i1
-    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]]) : (f64, i32) -> f64
+    // CHECK:         llvm.call spir_funccc @_Z22sub_group_shuffle_downdj(%[[VAL_6]], %[[VAL_7]])
     // CHECK:         llvm.mlir.constant(true) : i1
     %shuffleResult0, %valid0 = gpu.shuffle idx %val0, %id, %width : i32
     %shuffleResult1, %valid1 = gpu.shuffle xor %val1, %mask, %width : i64

>From 7102592af740ed9f8b388c12dd2182039a58167d Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 24 Jul 2024 09:35:09 -0700
Subject: [PATCH 041/259] [Offload] Repair and rename `llvm-omp-device-info`
 (to `-offload-`) (#100309)

The `llvm-omp-device-info` tool is very handy, but it broke due to the lazy
evaluation of devices. This repairs the functionality and adds a test.
The tool is also renamed to `llvm-offload-device-info`, as the `-omp-`
prefix is going away.
---
 offload/cmake/OpenMPTesting.cmake             |  12 +++
 offload/include/PluginManager.h               |  10 +-
 offload/src/PluginManager.cpp                 | 102 ++++++++++--------
 offload/src/interface.cpp                     |   2 +-
 offload/test/lit.cfg                          |   2 +
 offload/test/lit.site.cfg.in                  |   1 +
 offload/test/tools/llvm-omp-device-info.c     |   6 ++
 offload/tools/deviceinfo/CMakeLists.txt       |  10 +-
 ...-info.cpp => llvm-offload-device-info.cpp} |   9 +-
 9 files changed, 100 insertions(+), 54 deletions(-)
 create mode 100644 offload/test/tools/llvm-omp-device-info.c
 rename offload/tools/deviceinfo/{llvm-omp-device-info.cpp => llvm-offload-device-info.cpp} (73%)

diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake
index 11eafeb764260..6609d6301d0f9 100644
--- a/offload/cmake/OpenMPTesting.cmake
+++ b/offload/cmake/OpenMPTesting.cmake
@@ -37,6 +37,17 @@ function(find_standalone_test_dependencies)
     return()
   endif()
 
+  find_program(OFFLOAD_DEVICE_INFO_EXECUTABLE
+    NAMES llvm-offload-device-info
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OFFLOAD_DEVICE_INFO_EXECUTABLE)
+    message(STATUS "Cannot find 'llvm-offload-device-info'.")
+    message(STATUS "Please put 'not' in your PATH, set OFFLOAD_DEVICE_INFO_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+
   find_program(OPENMP_NOT_EXECUTABLE
     NAMES not
     PATHS ${OPENMP_LLVM_TOOLS_DIR})
@@ -71,6 +82,7 @@ else()
     set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck)
   endif()
   set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not)
+  set(OFFLOAD_DEVICE_INFO_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-offload-device-info)
 endif()
 
 # Macro to extract information about compiler from file. (no own scope)
diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h
index fce2adca49c11..94b6d1090b5a8 100644
--- a/offload/include/PluginManager.h
+++ b/offload/include/PluginManager.h
@@ -113,8 +113,14 @@ struct PluginManager {
     return Devices.getExclusiveAccessor();
   }
 
-  // Initialize all plugins.
-  void initAllPlugins();
+  /// Initialize \p Plugin. Returns true on success.
+  bool initializePlugin(GenericPluginTy &Plugin);
+
+  /// Initialize device \p DeviceNo of \p Plugin. Returns true on success.
+  bool initializeDevice(GenericPluginTy &Plugin, int32_t DeviceId);
+
+  /// Eagerly initialize all plugins and their devices.
+  void initializeAllDevices();
 
   /// Iterator range for all plugins (in use or not, but always valid).
   auto plugins() { return llvm::make_pointee_range(Plugins); }
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 5e8f91792a551..c6117782fbab6 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -13,6 +13,7 @@
 #include "PluginManager.h"
 #include "Shared/Debug.h"
 #include "Shared/Profile.h"
+#include "device.h"
 
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -60,15 +61,61 @@ void PluginManager::deinit() {
   DP("RTLs unloaded!\n");
 }
 
-void PluginManager::initAllPlugins() {
-  for (auto &R : plugins()) {
-    if (auto Err = R.init()) {
-      [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-      DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+bool PluginManager::initializePlugin(GenericPluginTy &Plugin) {
+  if (Plugin.is_initialized())
+    return true;
+
+  if (auto Err = Plugin.init()) {
+    [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+    DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+    return false;
+  }
+
+  DP("Registered plugin %s with %d visible device(s)\n", Plugin.getName(),
+     Plugin.number_of_devices());
+  return true;
+}
+
+bool PluginManager::initializeDevice(GenericPluginTy &Plugin,
+                                     int32_t DeviceId) {
+  if (Plugin.is_device_initialized(DeviceId))
+    return true;
+
+  // Initialize the device information for the RTL we are about to use.
+  auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
+
+  int32_t UserId = ExclusiveDevicesAccessor->size();
+
+  // Set the device identifier offset in the plugin.
+#ifdef OMPT_SUPPORT
+  Plugin.set_device_identifier(UserId, DeviceId);
+#endif
+
+  auto Device = std::make_unique<DeviceTy>(&Plugin, UserId, DeviceId);
+  if (auto Err = Device->init()) {
+    [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+    DP("Failed to init device %d: %s\n", DeviceId, InfoMsg.c_str());
+    return false;
+  }
+
+  ExclusiveDevicesAccessor->push_back(std::move(Device));
+
+  // We need to map between the plugin's device identifier and the one
+  // that OpenMP will use.
+  PM->DeviceIds[std::make_pair(&Plugin, DeviceId)] = UserId;
+
+  return true;
+}
+
+void PluginManager::initializeAllDevices() {
+  for (auto &Plugin : plugins()) {
+    if (!initializePlugin(Plugin))
       continue;
+
+    for (int32_t DeviceId = 0; DeviceId < Plugin.number_of_devices();
+         ++DeviceId) {
+      initializeDevice(Plugin, DeviceId);
     }
-    DP("Registered plugin %s with %d visible device(s)\n", R.getName(),
-       R.number_of_devices());
   }
 }
 
@@ -94,19 +141,12 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
-    for (auto &R : PM->plugins()) {
+    for (auto &R : plugins()) {
       if (!R.is_plugin_compatible(Img))
         continue;
 
-      if (!R.is_initialized()) {
-        if (auto Err = R.init()) {
-          [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-          DP("Failed to init plugin: %s\n", InfoMsg.c_str());
-          continue;
-        }
-        DP("Registered plugin %s with %d visible device(s)\n", R.getName(),
-           R.number_of_devices());
-      }
+      if (!initializePlugin(R))
+        continue;
 
       if (!R.number_of_devices()) {
         DP("Skipping plugin %s with no visible devices\n", R.getName());
@@ -120,30 +160,8 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
         DP("Image " DPxMOD " is compatible with RTL %s device %d!\n",
            DPxPTR(Img->ImageStart), R.getName(), DeviceId);
 
-        if (!R.is_device_initialized(DeviceId)) {
-          // Initialize the device information for the RTL we are about to use.
-          auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-
-          int32_t UserId = ExclusiveDevicesAccessor->size();
-
-          // Set the device identifier offset in the plugin.
-#ifdef OMPT_SUPPORT
-          R.set_device_identifier(UserId, DeviceId);
-#endif
-
-          auto Device = std::make_unique<DeviceTy>(&R, UserId, DeviceId);
-          if (auto Err = Device->init()) {
-            [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-            DP("Failed to init device %d: %s\n", DeviceId, InfoMsg.c_str());
-            continue;
-          }
-
-          ExclusiveDevicesAccessor->push_back(std::move(Device));
-
-          // We need to map between the plugin's device identifier and the one
-          // that OpenMP will use.
-          PM->DeviceIds[std::make_pair(&R, DeviceId)] = UserId;
-        }
+        if (!initializeDevice(R, DeviceId))
+          continue;
 
         // Initialize (if necessary) translation table for this library.
         PM->TrlTblMtx.lock();
@@ -219,7 +237,7 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image. We only need to scan RTLs that are already being used.
-    for (auto &R : PM->plugins()) {
+    for (auto &R : plugins()) {
       if (R.is_initialized())
         continue;
 
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 763b051cc6d77..b73e92f17ddb0 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -57,7 +57,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
 /// Initialize all available devices without registering any image
 EXTERN void __tgt_init_all_rtls() {
   assert(PM && "Runtime not initialized");
-  PM->initAllPlugins();
+  PM->initializeAllDevices();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 0479295c6cacc..6d6c77db80e16 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -408,3 +408,5 @@ config.substitutions.append(("%flags_clang", config.test_flags_clang))
 config.substitutions.append(("%flags_flang", config.test_flags_flang))
 config.substitutions.append(("%flags", config.test_flags))
 config.substitutions.append(("%not", config.libomptarget_not))
+config.substitutions.append(("%offload-device-info",
+                             config.offload_device_info))
diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in
index 43751970cac27..62ada1d81721d 100644
--- a/offload/test/lit.site.cfg.in
+++ b/offload/test/lit.site.cfg.in
@@ -23,6 +23,7 @@ config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()
 config.libomptarget_current_target = "@CURRENT_TARGET@"
 config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
 config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
+config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@"
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
diff --git a/offload/test/tools/llvm-omp-device-info.c b/offload/test/tools/llvm-omp-device-info.c
new file mode 100644
index 0000000000000..6f497309df2f2
--- /dev/null
+++ b/offload/test/tools/llvm-omp-device-info.c
@@ -0,0 +1,6 @@
+// RUN: %offload-device-info | %fcheck-generic
+//
+// Just check any device was found and something is printed
+//
+// CHECK: Found {{[1-9].*}} devices:
+// CHECK: Device 0:
diff --git a/offload/tools/deviceinfo/CMakeLists.txt b/offload/tools/deviceinfo/CMakeLists.txt
index a39de8833bf1d..3787c12f940a6 100644
--- a/offload/tools/deviceinfo/CMakeLists.txt
+++ b/offload/tools/deviceinfo/CMakeLists.txt
@@ -1,13 +1,13 @@
-message(STATUS "Building the llvm-omp-device-info tool")
+message(STATUS "Building the llvm-offload-device-info tool")
 
-add_openmp_tool(llvm-omp-device-info llvm-omp-device-info.cpp)
+add_openmp_tool(llvm-offload-device-info llvm-offload-device-info.cpp)
 
-llvm_update_compile_flags(llvm-omp-device-info)
+llvm_update_compile_flags(llvm-offload-device-info)
 
-target_include_directories(llvm-omp-device-info PRIVATE
+target_include_directories(llvm-offload-device-info PRIVATE
   ${LIBOMPTARGET_INCLUDE_DIR}
 )
-target_link_libraries(llvm-omp-device-info PRIVATE
+target_link_libraries(llvm-offload-device-info PRIVATE
   omp
   omptarget
 )
diff --git a/offload/tools/deviceinfo/llvm-omp-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
similarity index 73%
rename from offload/tools/deviceinfo/llvm-omp-device-info.cpp
rename to offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 7dd22aec9828f..2228fbf3ec173 100644
--- a/offload/tools/deviceinfo/llvm-omp-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -1,4 +1,4 @@
-//===- llvm-omp-device-info.cpp - Obtain device info as seen from OpenMP --===//
+//===- llvm-offload-device-info.cpp - Device info as seen by LLVM/Offload -===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a command line utility that, by using Libomptarget, and the device
-// plugins, list devices information as seen from the OpenMP Runtime.
+// This is a command line utility that uses LLVM/Offload and the device
+// plugins to list device information as seen by the runtime.
 //
 //===----------------------------------------------------------------------===//
 
@@ -19,8 +19,9 @@ int main(int argc, char **argv) {
   __tgt_register_lib(&EmptyDesc);
   __tgt_init_all_rtls();
 
+  printf("Found %d devices:\n", omp_get_num_devices());
   for (int Dev = 0; Dev < omp_get_num_devices(); Dev++) {
-    printf("Device (%d):\n", Dev);
+    printf("  Device %d:\n", Dev);
     if (!__tgt_print_device_info(Dev))
       printf("    print_device_info not implemented\n");
     printf("\n");

>From 83fb0643f7148499eb6ed19155ab69bfd8096ed7 Mon Sep 17 00:00:00 2001
From: memory-thrasher <39869500+memory-thrasher at users.noreply.github.com>
Date: Wed, 24 Jul 2024 09:38:47 -0700
Subject: [PATCH 042/259] Adds a pseudonym to clang's windows mangler...
 (#97792)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…to handle template argument values that are pointers one-past-the-end
of a non-array symbol. Also improves error messages in other template
argument scenarios where clang bails.

https://github.com/llvm/llvm-project/issues/97756

I don't think I hooked up the unit test right. I'm not sure one is
really needed for what boils down to a tweaked if statement. Please
advise.
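
For readers following along, the situation this addresses is roughly the
following (an illustrative sketch only, not part of the patch; the patch's
own coverage is the new CodeGen/ms_mangler_templatearg_opte.cpp test below):

// Requires -std=c++20 (class-type non-type template parameter).
struct Ref {
  const int *ptr;
};

template <Ref> void use() {}

constexpr int one = 7;
constexpr int arr[4] = {0, 1, 2, 3};

void instantiate() {
  // Both arguments hold one-past-the-end pointers; before this change the
  // MSVC mangler bailed with "cannot mangle this template argument yet".
  use<Ref{&one + 1}>();  // one past the end of a non-array object
  use<Ref{arr + 4}>();   // one past the end of an array, for contrast
}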
---
 clang/lib/AST/MicrosoftMangle.cpp             | 217 +++++++-----------
 .../CodeGen/ms_mangler_templatearg_opte.cpp   |  19 ++
 .../aarch64-mangle-sve-vectors-msvc.cpp       |   2 +-
 3 files changed, 106 insertions(+), 132 deletions(-)
 create mode 100644 clang/test/CodeGen/ms_mangler_templatearg_opte.cpp

diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 4016043df62ed..e0d7c01ca3351 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -337,6 +337,10 @@ class MicrosoftCXXNameMangler {
 
   const bool PointersAre64Bit;
 
+  DiagnosticBuilder Error(SourceLocation, StringRef, StringRef);
+  DiagnosticBuilder Error(SourceLocation, StringRef);
+  DiagnosticBuilder Error(StringRef);
+
 public:
   enum QualifierMangleMode { QMM_Drop, QMM_Mangle, QMM_Escape, QMM_Result };
   enum class TplArgKind { ClassNTTP, StructuralValue };
@@ -564,6 +568,31 @@ MicrosoftMangleContextImpl::shouldMangleStringLiteral(const StringLiteral *SL) {
   return true;
 }
 
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(SourceLocation loc,
+                                                 StringRef thing1,
+                                                 StringRef thing2) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 %1 yet");
+  return Diags.Report(loc, DiagID) << thing1 << thing2;
+}
+
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(SourceLocation loc,
+                                                 StringRef thingy) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 yet");
+  return Diags.Report(loc, DiagID) << thingy;
+}
+
+DiagnosticBuilder MicrosoftCXXNameMangler::Error(StringRef thingy) {
+  DiagnosticsEngine &Diags = Context.getDiags();
+  // Extra placeholders are quietly ignored when not used.
+  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+                                          "cannot mangle this %0 yet");
+  return Diags.Report(DiagID) << thingy;
+}
+
 void MicrosoftCXXNameMangler::mangle(GlobalDecl GD, StringRef Prefix) {
   const NamedDecl *D = cast<NamedDecl>(GD.getDecl());
   // MSVC doesn't mangle C++ names the same way it mangles extern "C" names.
@@ -1578,10 +1607,7 @@ void MicrosoftCXXNameMangler::mangleOperatorName(OverloadedOperatorKind OO,
   case OO_Spaceship: Out << "?__M"; break;
 
   case OO_Conditional: {
-    DiagnosticsEngine &Diags = Context.getDiags();
-    unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-      "cannot mangle this conditional operator yet");
-    Diags.Report(Loc, DiagID);
+    Error(Loc, "conditional operator");
     break;
   }
 
@@ -1673,11 +1699,8 @@ void MicrosoftCXXNameMangler::mangleExpression(
   }
 
   // As bad as this diagnostic is, it's better than crashing.
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot yet mangle expression type %0");
-  Diags.Report(E->getExprLoc(), DiagID) << E->getStmtClassName()
-                                        << E->getSourceRange();
+  Error(E->getExprLoc(), "expression type: ", E->getStmtClassName())
+      << E->getSourceRange();
 }
 
 void MicrosoftCXXNameMangler::mangleTemplateArgs(
@@ -1923,11 +1946,19 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
     if (WithScalarType)
       mangleType(T, SourceRange(), QMM_Escape);
 
-    // We don't know how to mangle past-the-end pointers yet.
-    if (V.isLValueOnePastTheEnd())
-      break;
-
     APValue::LValueBase Base = V.getLValueBase();
+
+    // This might not cover every case, but it does cover issue 97756;
+    // see test CodeGen/ms_mangler_templatearg_opte.
+    if (V.isLValueOnePastTheEnd()) {
+      Out << "5E";
+      auto *VD = Base.dyn_cast<const ValueDecl *>();
+      if (VD)
+        mangle(VD);
+      Out << "@";
+      return;
+    }
+
     if (!V.hasLValuePath() || V.getLValuePath().empty()) {
       // Taking the address of a complete object has a special-case mangling.
       if (Base.isNull()) {
@@ -1939,12 +1970,14 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
         mangleNumber(V.getLValueOffset().getQuantity());
       } else if (!V.hasLValuePath()) {
         // FIXME: This can only happen as an extension. Invent a mangling.
-        break;
+        Error("template argument (extension not comaptible with ms mangler)");
+        return;
       } else if (auto *VD = Base.dyn_cast<const ValueDecl*>()) {
         Out << "E";
         mangle(VD);
       } else {
-        break;
+        Error("template argument (undeclared base)");
+        return;
       }
     } else {
       if (TAK == TplArgKind::ClassNTTP && T->isPointerType())
@@ -1989,8 +2022,10 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
         Out << *I;
 
       auto *VD = Base.dyn_cast<const ValueDecl*>();
-      if (!VD)
-        break;
+      if (!VD) {
+        Error("template argument (null value decl)");
+        return;
+      }
       Out << (TAK == TplArgKind::ClassNTTP ? 'E' : '1');
       mangle(VD);
 
@@ -2105,15 +2140,16 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T,
     return;
   }
 
-  case APValue::AddrLabelDiff:
-  case APValue::FixedPoint:
-    break;
+  case APValue::AddrLabelDiff: {
+    Error("template argument (value type: address label diff)");
+    return;
   }
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this template argument yet");
-  Diags.Report(DiagID);
+  case APValue::FixedPoint: {
+    Error("template argument (value type: fixed point)");
+    return;
+  }
+  }
 }
 
 void MicrosoftCXXNameMangler::mangleObjCProtocol(const ObjCProtocolDecl *PD) {
@@ -2740,11 +2776,9 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
   case BuiltinType::SatULongFract:
   case BuiltinType::Ibm128:
   case BuiltinType::Float128: {
-    DiagnosticsEngine &Diags = Context.getDiags();
-    unsigned DiagID = Diags.getCustomDiagID(
-        DiagnosticsEngine::Error, "cannot mangle this built-in %0 type yet");
-    Diags.Report(Range.getBegin(), DiagID)
-        << T->getName(Context.getASTContext().getPrintingPolicy()) << Range;
+    Error(Range.getBegin(), "built-in type: ",
+          T->getName(Context.getASTContext().getPrintingPolicy()))
+        << Range;
     break;
   }
   }
@@ -3062,10 +3096,7 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC,
       return;
   }
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this calling convention yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "calling convention") << Range;
 }
 void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T,
                                                       SourceRange Range) {
@@ -3086,11 +3117,7 @@ void MicrosoftCXXNameMangler::mangleType(const UnresolvedUsingType *T,
                                          Qualifiers, SourceRange Range) {
   // Probably should be mangled as a template instantiation; need to see what
   // VC does first.
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this unresolved dependent type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "unresolved dependent type") << Range;
 }
 
 // <type>        ::= <union-type> | <struct-type> | <class-type> | <enum-type>
@@ -3197,11 +3224,8 @@ void MicrosoftCXXNameMangler::mangleArrayType(const ArrayType *T) {
       // The dependent expression has to be folded into a constant (TODO).
       const DependentSizedArrayType *DSAT =
         getASTContext().getAsDependentSizedArrayType(ElementTy);
-      DiagnosticsEngine &Diags = Context.getDiags();
-      unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-        "cannot mangle this dependent-length array yet");
-      Diags.Report(DSAT->getSizeExpr()->getExprLoc(), DiagID)
-        << DSAT->getBracketsRange();
+      Error(DSAT->getSizeExpr()->getExprLoc(), "dependent-length array")
+          << DSAT->getBracketsRange();
       return;
     } else {
       break;
@@ -3241,20 +3265,12 @@ void MicrosoftCXXNameMangler::mangleType(const MemberPointerType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const TemplateTypeParmType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this template type parameter type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "template type parameter type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const SubstTemplateTypeParmPackType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this substituted parameter pack yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "substituted parameter pack") << Range;
 }
 
 // <type> ::= <pointer-type>
@@ -3405,46 +3421,27 @@ void MicrosoftCXXNameMangler::mangleType(const ExtVectorType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const DependentVectorType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "cannot mangle this dependent-sized vector type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent-sized vector type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentSizedExtVectorType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent-sized extended vector type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent-sized extended vector type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const ConstantMatrixType *T,
                                          Qualifiers quals, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-                                          "Cannot mangle this matrix type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "matrix type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentSizedMatrixType *T,
                                          Qualifiers quals, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "Cannot mangle this dependent-sized matrix type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent-sized matrix type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentAddressSpaceType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error,
-      "cannot mangle this dependent address space type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "dependent address space type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const ObjCInterfaceType *T, Qualifiers,
@@ -3514,39 +3511,23 @@ void MicrosoftCXXNameMangler::mangleType(const InjectedClassNameType *,
 
 void MicrosoftCXXNameMangler::mangleType(const TemplateSpecializationType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "template specialization type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentNameType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent name type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent name type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(
     const DependentTemplateSpecializationType *T, Qualifiers,
     SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this dependent template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "dependent template specialization type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const PackExpansionType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this pack expansion yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "pack expansion") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const PackIndexingType *T,
@@ -3557,60 +3538,37 @@ void MicrosoftCXXNameMangler::mangleType(const PackIndexingType *T,
 
 void MicrosoftCXXNameMangler::mangleType(const TypeOfType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this typeof(type) yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "typeof(type)") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const TypeOfExprType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this typeof(expression) yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "typeof(expression)") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DecltypeType *T, Qualifiers,
                                          SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this decltype() yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "decltype()") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const UnaryTransformType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this unary transform type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "unary transform type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const AutoType *T, Qualifiers,
                                          SourceRange Range) {
   assert(T->getDeducedType().isNull() && "expecting a dependent type!");
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this 'auto' type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "'auto' type") << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(
     const DeducedTemplateSpecializationType *T, Qualifiers, SourceRange Range) {
   assert(T->getDeducedType().isNull() && "expecting a dependent type!");
 
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
-    "cannot mangle this deduced class template specialization type yet");
-  Diags.Report(Range.getBegin(), DiagID)
-    << Range;
+  Error(Range.getBegin(), "deduced class template specialization type")
+      << Range;
 }
 
 void MicrosoftCXXNameMangler::mangleType(const AtomicType *T, Qualifiers,
@@ -3684,10 +3642,7 @@ void MicrosoftCXXNameMangler::mangleType(const BitIntType *T, Qualifiers,
 
 void MicrosoftCXXNameMangler::mangleType(const DependentBitIntType *T,
                                          Qualifiers, SourceRange Range) {
-  DiagnosticsEngine &Diags = Context.getDiags();
-  unsigned DiagID = Diags.getCustomDiagID(
-      DiagnosticsEngine::Error, "cannot mangle this DependentBitInt type yet");
-  Diags.Report(Range.getBegin(), DiagID) << Range;
+  Error(Range.getBegin(), "DependentBitInt type") << Range;
 }
 
 // <this-adjustment> ::= <no-adjustment> | <static-adjustment> |
diff --git a/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp b/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp
new file mode 100644
index 0000000000000..b17d97bb04cad
--- /dev/null
+++ b/clang/test/CodeGen/ms_mangler_templatearg_opte.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm -std=c++20 -x c++ < %s | FileCheck -check-prefix=WIN64 %s
+
+struct A {
+  const int* ptr;
+};
+
+template<A> void tfn() {};
+
+// WIN64: ??$tfn@$2UA@@PEBH5CE?ints@@3QBHB06@@@@@YAXXZ
+constexpr int ints[] = { 1, 2, 7, 8, 9, -17, -10 };
+
+// WIN64: ??$tfn@$2UA@@PEBH5E?one_int@@3HB@@@@YAXXZ
+constexpr int one_int = 7;
+
+void template_instance() {
+  tfn<A{ints + sizeof(ints)/sizeof(int)}>();
+  tfn<A{&one_int + 1}>();
+}
+
diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
index 55b87da122b04..435feec774ece 100644
--- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
+++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors-msvc.cpp
@@ -3,5 +3,5 @@
 
 template<typename T> struct S {};
 
-// CHECK: cannot mangle this built-in __SVInt8_t type yet
+// CHECK: cannot mangle this built-in type: __SVInt8_t yet
 void f1(S<__SVInt8_t>) {}

>From 0c274d527ae36db420c4273f83fcb6c859b28f1b Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Wed, 24 Jul 2024 09:33:51 -0700
Subject: [PATCH 043/259] [RISCV][TTI] Add coverage of odd sized vector loads
 and stores

---
 .../CostModel/RISCV/rvv-load-store.ll         | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
index d24a29dab03d9..af4ca7cc958c4 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -493,3 +493,102 @@ define void @store_of_constant(ptr %p) {
 
   ret void
 }
+
+
+define void @load_oddsize_vectors(ptr %p) {
+; CHECK-LABEL: 'load_oddsize_vectors'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i32>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <5 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <7 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <9 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <15 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %12 = load <31 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'load_oddsize_vectors'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <5 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <6 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <7 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <9 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <15 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %12 = load <31 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+
+  load <1 x i32>, ptr %p
+  load <2 x i32>, ptr %p
+  load <3 x i32>, ptr %p
+  load <4 x i32>, ptr %p
+  load <5 x i32>, ptr %p
+  load <6 x i32>, ptr %p
+  load <7 x i32>, ptr %p
+  load <8 x i32>, ptr %p
+  load <9 x i32>, ptr %p
+  load <15 x i32>, ptr %p
+  load <16 x i32>, ptr %p
+  load <31 x i32>, ptr %p
+  load <32 x i32>, ptr %p
+
+  ret void
+}
+
+define void @store_oddsize_vectors(ptr %p) {
+; CHECK-LABEL: 'store_oddsize_vectors'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i32> undef, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'store_oddsize_vectors'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  store <1 x i32> undef, ptr %p
+  store <2 x i32> undef, ptr %p
+  store <3 x i32> undef, ptr %p
+  store <4 x i32> undef, ptr %p
+  store <5 x i32> undef, ptr %p
+  store <6 x i32> undef, ptr %p
+  store <7 x i32> undef, ptr %p
+  store <8 x i32> undef, ptr %p
+  store <15 x i32> undef, ptr %p
+  store <16 x i32> undef, ptr %p
+  store <31 x i32> undef, ptr %p
+  store <32 x i32> undef, ptr %p
+
+  ret void
+}

>From cdc193459d90fad83d2eafaccfe03368a9a8a160 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Wed, 24 Jul 2024 13:02:33 -0400
Subject: [PATCH 044/259] Update the backporting docs (#100401)

The documentation implies the special commands only work with issues,
but they also work directly from a pull request.
---
 llvm/docs/GitHub.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index 4010958c5d313..14424f722abf6 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -376,10 +376,10 @@ Releases
 
 Backporting Fixes to the Release Branches
 -----------------------------------------
-You can use special comments on issues to make backport requests for the
-release branches.  This is done by making a comment containing the following
-command on any issue that has been added to one of the "X.Y.Z Release"
-milestones.
+You can use special comments on issues or pull requests to make backport
+requests for the release branches.  This is done by making a comment containing
+the following command on any issue or pull request that has been added to one
+of the "X.Y.Z Release" milestones.
 
 ::
 
@@ -388,8 +388,8 @@ milestones.
 This command takes one or more git commit hashes as arguments and will attempt
 to cherry-pick the commit(s) to the release branch.  If the commit(s) fail to
 apply cleanly, then a comment with a link to the failing job will be added to
-the issue.  If the commit(s) do apply cleanly, then a pull request will
-be created with the specified commits.
+the issue/pull request.  If the commit(s) do apply cleanly, then a pull request
+will be created with the specified commits.
 
 If a commit you want to backport does not apply cleanly, you may resolve
 the conflicts locally and then create a pull request against the release

>From 8fd9624cf729dd722a170a9dfd8f725966515231 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Wed, 24 Jul 2024 10:02:21 -0700
Subject: [PATCH 045/259] [RISCV][CostModel] Test default lowering strategy for
 vector LD/ST

Remove two options from the test command line:

* -riscv-v-vector-bits-min=128 -- This has no effect since v implies zvl128b,
  which provides the same minimum.
* -riscv-v-fixed-length-vector-lmul-max=1 -- This one forced the backend
  to split as if LMUL>m1 were illegal.  This diverges significantly from
  default behavior (where all LMUL are legal).  Note that we do still
  have splitting test coverage after this change.  Look at e.g. 32 x i64.
---
 .../CostModel/RISCV/rvv-load-store.ll         | 120 +++++++++---------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
index af4ca7cc958c4..4e92937739d9b 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefixes=CODESIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CODESIZE
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
@@ -80,7 +80,7 @@ define void @load(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i8>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i8>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i8>, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i8>, ptr %p, align 1
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i8>, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i8>, ptr %p, align 4
@@ -92,8 +92,8 @@ define void @load(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i16>, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i16>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i16>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = load <16 x i16>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = load <32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <32 x i16>, ptr %p, align 64
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i16>, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i16>, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i16>, ptr %p, align 8
@@ -104,9 +104,9 @@ define void @load(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i32>, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i32>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %32 = load <16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %33 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = load <32 x i32>, ptr %p, align 128
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i32>, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i32>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i32>, ptr %p, align 16
@@ -116,10 +116,10 @@ define void @load(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = load <8 x i64>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = load <16 x i64>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = load <32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %44 = load <8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %45 = load <16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %46 = load <32 x i64>, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i64>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i64>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %49 = load <vscale x 4 x i64>, ptr %p, align 32
@@ -129,10 +129,10 @@ define void @load(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %53 = load ptr, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x ptr>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x ptr>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x ptr>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = load <vscale x 2 x ptr>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %62 = load <vscale x 4 x ptr>, ptr %p, align 32
@@ -291,7 +291,7 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i8> undef, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i8> undef, ptr %p, align 1
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr %p, align 4
@@ -303,8 +303,8 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i16> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i16> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> undef, ptr %p, align 64
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i16> undef, ptr %p, align 2
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i16> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i16> undef, ptr %p, align 8
@@ -315,9 +315,9 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> undef, ptr %p, align 128
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i32> undef, ptr %p, align 4
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i32> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i32> undef, ptr %p, align 16
@@ -327,10 +327,10 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i64> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i64> undef, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i64> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i64> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i64> undef, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i64> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i64> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i64> undef, ptr %p, align 32
@@ -340,10 +340,10 @@ define void @store(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store ptr undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x ptr> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x ptr> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x ptr> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x ptr> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x ptr> undef, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x ptr> undef, ptr %p, align 256
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x ptr> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x ptr> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x ptr> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x ptr> undef, ptr %p, align 256
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x ptr> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x ptr> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x ptr> undef, ptr %p, align 32
@@ -451,9 +451,9 @@ define void @store_of_constant(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> zeroinitializer, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> zeroinitializer, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> zeroinitializer, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr %p, align 32
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> <i32 4096, i32 4096, i32 4096, i32 4096>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> <i32 1, i32 1, i32 2, i32 1>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i32> <i32 2, i32 1, i32 1, i32 1>, ptr %p, align 16
@@ -501,14 +501,14 @@ define void @load_oddsize_vectors(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i32>, ptr %p, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <5 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <6 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <7 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %5 = load <5 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %6 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %7 = load <7 x i32>, ptr %p, align 32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <9 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <15 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %9 = load <9 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %10 = load <15 x i32>, ptr %p, align 64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %12 = load <31 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %12 = load <31 x i32>, ptr %p, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <32 x i32>, ptr %p, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -517,15 +517,15 @@ define void @load_oddsize_vectors(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i32>, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <5 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <6 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = load <7 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <9 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <15 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %12 = load <31 x i32>, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %13 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i32>, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i32>, ptr %p, align 128
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 
@@ -552,13 +552,13 @@ define void @store_oddsize_vectors(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i32> undef, ptr %p, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <7 x i32> undef, ptr %p, align 32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <15 x i32> undef, ptr %p, align 64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: store <31 x i32> undef, ptr %p, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -567,14 +567,14 @@ define void @store_oddsize_vectors(ptr %p) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> undef, ptr %p, align 32
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> undef, ptr %p, align 64
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> undef, ptr %p, align 128
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   store <1 x i32> undef, ptr %p

>From bb0300cf7ce54bfbb1607348c89cb0525e12076b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 24 Jul 2024 17:56:39 +0200
Subject: [PATCH 046/259] [clang][Interp] Fix initializing array subobjects
 with This pointers

We need to select the right array element once we see the CXXThisExpr.
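
For context, here is a minimal constexpr reduction of the pattern this fixes (a
hypothetical sketch; the records.cpp test added below is the authoritative
reproducer). An element's default member initializer that reads a sibling field
through the implicit `this` must see the array element currently being
initialized, not the array base:

  struct Inner { const char *S = nullptr; };
  struct Elem {
    Inner In;
    unsigned Width = 0;
    unsigned Mask = Width + 1; // reads Width through the implicit 'this'
  };
  constexpr Elem Arr[] = { {} }; // 'this' must designate Arr[0] here
  static_assert(Arr[0].Mask == 1, "");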
---
 clang/lib/AST/Interp/Compiler.cpp |  5 +++++
 clang/lib/AST/Interp/Compiler.h   |  6 ++++++
 clang/test/AST/Interp/records.cpp | 19 +++++++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 58a0852aca755..4d5d725d6964c 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -89,6 +89,10 @@ bool InitLink::emit(Compiler<Emitter> *Ctx, const Expr *E) const {
     return Ctx->emitGetPtrLocal(Offset, E);
   case K_Decl:
     return Ctx->visitDeclRef(D, E);
+  case K_Elem:
+    if (!Ctx->emitConstUint32(Offset, E))
+      return false;
+    return Ctx->emitArrayElemPtrUint32(E);
   default:
     llvm_unreachable("Unhandled InitLink kind");
   }
@@ -1556,6 +1560,7 @@ bool Compiler<Emitter>::visitArrayElemInit(unsigned ElemIndex,
     return this->emitInitElem(*T, ElemIndex, Init);
   }
 
+  InitLinkScope<Emitter> ILS(this, InitLink::Elem(ElemIndex));
   // Advance the pointer currently on the stack to the given
   // dimension.
   if (!this->emitConstUint32(ElemIndex, Init))
diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h
index 084f5aef25f8e..d22b29d29a92d 100644
--- a/clang/lib/AST/Interp/Compiler.h
+++ b/clang/lib/AST/Interp/Compiler.h
@@ -50,6 +50,7 @@ struct InitLink {
     K_Field = 1,
     K_Temp = 2,
     K_Decl = 3,
+    K_Elem = 5,
   };
 
   static InitLink This() { return InitLink{K_This}; }
@@ -68,6 +69,11 @@ struct InitLink {
     IL.D = D;
     return IL;
   }
+  static InitLink Elem(unsigned Index) {
+    InitLink IL{K_Elem};
+    IL.Offset = Index;
+    return IL;
+  }
 
   InitLink(uint8_t Kind) : Kind(Kind) {}
   template <class Emitter>
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 2fc88a0b1df6a..d77e5a5c782ce 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1537,3 +1537,22 @@ namespace BitSet {
     Bitset()
   };
 }
+
+namespace ArrayInitChain {
+  struct StringLiteral {
+    const char *S;
+  };
+
+  struct CustomOperandVal {
+    StringLiteral Str;
+    unsigned Width;
+    unsigned Mask = Width + 1;
+  };
+
+  constexpr CustomOperandVal A[] = {
+    {},
+  };
+  static_assert(A[0].Str.S == nullptr, "");
+  static_assert(A[0].Width == 0, "");
+  static_assert(A[0].Mask == 1, "");
+}

>From ea222be0d9266d9d5c100496f8c9606f213454ee Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott at quicinc.com>
Date: Wed, 24 Jul 2024 18:24:39 +0100
Subject: [PATCH 047/259] [MC] Honour alignment directive fill value for
 non-intel (#100136)

As reported in https://llvm.org/PR30955, `.balign` with a fill value of 0 did
not actually pad the alignment with zeroes on non-x86 targets.

This happened because the check that decides between the code-alignment
routines and a plain byte fill compared the requested fill value against
`TextAlignFillValue`, which most targets leave at its default of 0 (x86
overrides it). Most targets never set this value because no single fill byte
makes sense for code -- on AArch64, for example, instructions are word-sized
and must be properly aligned, so there is no reasonable padding byte.

I think the check at the end of `AsmParser::parseDirectiveAlign` is
suspicious even on x86: if you write `.balign <align>, 0x90` in a code
section, you do not get a run of repeated `0x90` bytes; you get NOP
instructions of various widths. That behaviour was never tested.

The fix is to stop comparing the fill value against the default text-align
fill value: code alignment is now used only when no explicit fill value is
given.

Fixes #30303
---
 lld/test/COFF/lto-cpu-string.ll             |  2 +-
 lld/test/ELF/lto/cpu-string.ll              |  2 +-
 lld/test/ELF/lto/mllvm.ll                   |  2 +-
 lld/test/MachO/lto-cpu-string.ll            |  2 +-
 llvm/docs/ReleaseNotes.rst                  | 19 ++++++++++
 llvm/lib/MC/MCParser/AsmParser.cpp          | 28 +++++++--------
 llvm/test/MC/AArch64/align-fill-byte-zero.s | 22 ++++++++++++
 llvm/test/MC/ARM/align-fill-byte-zero.s     | 40 +++++++++++++++++++++
 llvm/test/MC/AsmParser/directive_align.s    |  2 ++
 llvm/test/MC/COFF/align-nops.s              |  2 +-
 llvm/test/MC/ELF/align-nops.s               |  4 +--
 llvm/test/MC/MachO/x86_32-optimal_nop.s     | 30 ++++++++--------
 llvm/test/MC/RISCV/align-fill-byte-zero.s   | 29 +++++++++++++++
 llvm/test/MC/X86/code16gcc-align.s          |  6 ++--
 14 files changed, 149 insertions(+), 41 deletions(-)
 create mode 100644 llvm/test/MC/AArch64/align-fill-byte-zero.s
 create mode 100644 llvm/test/MC/ARM/align-fill-byte-zero.s
 create mode 100644 llvm/test/MC/RISCV/align-fill-byte-zero.s

diff --git a/lld/test/COFF/lto-cpu-string.ll b/lld/test/COFF/lto-cpu-string.ll
index c2a8df5f75bad..0f233b32bd43a 100644
--- a/lld/test/COFF/lto-cpu-string.ll
+++ b/lld/test/COFF/lto-cpu-string.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-pc-windows-msvc19.14.26433"
 
 define dllexport void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/ELF/lto/cpu-string.ll b/lld/test/ELF/lto/cpu-string.ll
index 3697cbd6d9472..5444339007b0d 100644
--- a/lld/test/ELF/lto/cpu-string.ll
+++ b/lld/test/ELF/lto/cpu-string.ll
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/ELF/lto/mllvm.ll b/lld/test/ELF/lto/mllvm.ll
index 883a9c8d8dc75..d09dcd0bf7b95 100644
--- a/lld/test/ELF/lto/mllvm.ll
+++ b/lld/test/ELF/lto/mllvm.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 define void @_start() #0 {
 entry:
-  call void asm sideeffect ".p2align 4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align 4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/lld/test/MachO/lto-cpu-string.ll b/lld/test/MachO/lto-cpu-string.ll
index dcd51bef6f446..3736a2e1f0fa8 100644
--- a/lld/test/MachO/lto-cpu-string.ll
+++ b/lld/test/MachO/lto-cpu-string.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 define void @foo() #0 {
 entry:
-  call void asm sideeffect ".p2align        4, 0x90", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align        4", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 46fa2a74450e1..c6a4a74a220af 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -65,12 +65,20 @@ Changes to Interprocedural Optimizations
 Changes to the AArch64 Backend
 ------------------------------
 
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
 Changes to the ARM Backend
 --------------------------
 
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
+
 Changes to the AVR Backend
 --------------------------
 
@@ -92,6 +100,10 @@ Changes to the PowerPC Backend
 Changes to the RISC-V Backend
 -----------------------------
 
+* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill
+  the required alignment space with a sequence of `0x0` bytes (the requested
+  fill value) rather than NOPs.
+
 Changes to the WebAssembly Backend
 ----------------------------------
 
@@ -101,6 +113,13 @@ Changes to the Windows Target
 Changes to the X86 Backend
 --------------------------
 
+* `.balign N, 0x90`, `.p2align N, 0x90`, and `.align N, 0x90` in code sections
+  now fill the required alignment space with repeating `0x90` bytes, rather than
+  using optimised NOP filling. Optimised NOP filling fills the space with NOP
+  instructions of various widths, not just those that use the `0x90` byte
+  encoding. To use optimised NOP filling in a code section, leave off the
+  "fillval" argument, i.e. `.balign N`, `.p2align N` or `.align N` respectively.
+
 Changes to the OCaml bindings
 -----------------------------
 
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 992b69f1c5f32..1b9c624136e8b 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3462,17 +3462,6 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
     }
   }
 
-  if (HasFillExpr && FillExpr != 0) {
-    MCSection *Sec = getStreamer().getCurrentSectionOnly();
-    if (Sec && Sec->isVirtualSection()) {
-      ReturnVal |=
-          Warning(FillExprLoc, "ignoring non-zero fill value in " +
-                                   Sec->getVirtualSectionKind() + " section '" +
-                                   Sec->getName() + "'");
-      FillExpr = 0;
-    }
-  }
-
   // Diagnose non-sensical max bytes to align.
   if (MaxBytesLoc.isValid()) {
     if (MaxBytesToFill < 1) {
@@ -3489,13 +3478,20 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
     }
   }
 
-  // Check whether we should use optimal code alignment for this .align
-  // directive.
   const MCSection *Section = getStreamer().getCurrentSectionOnly();
   assert(Section && "must have section to emit alignment");
-  bool useCodeAlign = Section->useCodeAlign();
-  if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
-      ValueSize == 1 && useCodeAlign) {
+
+  if (HasFillExpr && FillExpr != 0 && Section->isVirtualSection()) {
+    ReturnVal |=
+        Warning(FillExprLoc, "ignoring non-zero fill value in " +
+                                 Section->getVirtualSectionKind() +
+                                 " section '" + Section->getName() + "'");
+    FillExpr = 0;
+  }
+
+  // Check whether we should use optimal code alignment for this .align
+  // directive.
+  if (Section->useCodeAlign() && !HasFillExpr) {
     getStreamer().emitCodeAlignment(
         Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill);
   } else {
diff --git a/llvm/test/MC/AArch64/align-fill-byte-zero.s b/llvm/test/MC/AArch64/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..eed8750845568
--- /dev/null
+++ b/llvm/test/MC/AArch64/align-fill-byte-zero.s
@@ -0,0 +1,22 @@
+// RUN: llvm-mc -triple aarch64 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple aarch64 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM: add     x14, x14, #1
+// OBJ: 910005ce      add     x14, x14, #0x1
+  add x14, x14, 0x1
+
+// ASM: .p2align 4, 0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+// OBJ-NEXT: 00000000      udf     #0x0
+  .balign 0x10, 0
+
+// ASM: add     x14, x14, #1
+// OBJ-NEXT: 910005ce      add     x14, x14, #0x1
+  add x14, x14, 0x1
diff --git a/llvm/test/MC/ARM/align-fill-byte-zero.s b/llvm/test/MC/ARM/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..cc1e27e80e920
--- /dev/null
+++ b/llvm/test/MC/ARM/align-fill-byte-zero.s
@@ -0,0 +1,40 @@
+// RUN: llvm-mc -triple armv7a %s -o - | FileCheck %s --check-prefix=ASM-ARM
+// RUN: llvm-mc -triple armv7a -filetype obj %s -o - | \
+// RUN:   llvm-objdump --triple=armv7a -dz - | FileCheck %s --check-prefix=OBJ-ARM
+
+// RUN: llvm-mc -triple thumbv7a %s -o - | FileCheck %s --check-prefix=ASM-THUMB
+// RUN: llvm-mc -triple thumbv7a -filetype obj %s -o - | \
+// RUN:   llvm-objdump --triple=thumbv7a -dz - | FileCheck %s --check-prefix=OBJ-THUMB
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM-ARM: add     r0, r0, #1
+// OBJ-ARM: e2800001      add     r0, r0, #1
+
+// ASM-THUMB: add.w   r0, r0, #1
+// OBJ-THUMB: f100 0001      add.w     r0, r0, #0x1
+  add r0, r0, 0x1
+
+// ASM-ARM: .p2align 4, 0x0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+// OBJ-ARM-NEXT: 00000000      andeq   r0, r0, r0
+
+// ASM-THUMB: .p2align 4, 0x0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+// OBJ-THUMB-NEXT: 0000          movs    r0, r0
+  .balign 0x10, 0
+
+// ASM-ARM: add     r0, r0, #1
+// OBJ-ARM-NEXT: e2800001      add     r0, r0, #1
+
+// ASM-THUMB: add.w   r0, r0, #1
+// OBJ-THUMB-NEXT: f100 0001      add.w     r0, r0, #0x1
+  add r0, r0, 0x1
diff --git a/llvm/test/MC/AsmParser/directive_align.s b/llvm/test/MC/AsmParser/directive_align.s
index 0430f51740861..d71f559fd03da 100644
--- a/llvm/test/MC/AsmParser/directive_align.s
+++ b/llvm/test/MC/AsmParser/directive_align.s
@@ -1,6 +1,8 @@
 # RUN: not llvm-mc -triple i386-apple-darwin9 %s 2> %t.err | FileCheck %s
 # RUN: FileCheck < %t.err %s --check-prefix=CHECK-WARN
 
+        .data
+
 # CHECK: TEST0:
 # CHECK: .p2align 1
 TEST0:  
diff --git a/llvm/test/MC/COFF/align-nops.s b/llvm/test/MC/COFF/align-nops.s
index d6906d07148b6..7264d70938cea 100644
--- a/llvm/test/MC/COFF/align-nops.s
+++ b/llvm/test/MC/COFF/align-nops.s
@@ -4,7 +4,7 @@
     .text
 f0:
     .long 0
-    .align  8, 0x90
+    .align  8
     .long 0
     .align  8
 
diff --git a/llvm/test/MC/ELF/align-nops.s b/llvm/test/MC/ELF/align-nops.s
index 41a007111f406..5ce03d5a08aff 100644
--- a/llvm/test/MC/ELF/align-nops.s
+++ b/llvm/test/MC/ELF/align-nops.s
@@ -4,14 +4,14 @@
     .text
 f0:
     .long 0
-    .align  8, 0x00000090
+    .align  8
     .long 0
     .align  8
 
 // But not in another section
     .data
     .long 0
-    .align  8, 0x00000090
+    .align  8, 0x90
     .long 0
     .align  8
 
diff --git a/llvm/test/MC/MachO/x86_32-optimal_nop.s b/llvm/test/MC/MachO/x86_32-optimal_nop.s
index bb784b9da9954..19655b5405152 100644
--- a/llvm/test/MC/MachO/x86_32-optimal_nop.s
+++ b/llvm/test/MC/MachO/x86_32-optimal_nop.s
@@ -5,7 +5,7 @@
         ret
         # nop
         # 0x90
-        .align 1, 0x90
+        .align 1
         ret
 # 2 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -13,14 +13,14 @@
         ret
         # xchg %ax,%ax
         # 0x66, 0x90
-        .align 2, 0x90
+        .align 2
         ret
 # 3 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
         ret
         # nopl (%[re]ax)
         # 0x0f, 0x1f, 0x00
-        .align 2, 0x90
+        .align 2
         ret
 # 4 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -30,7 +30,7 @@
         ret
         # nopl 0(%[re]ax)
         # 0x0f, 0x1f, 0x40, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 5 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -39,7 +39,7 @@
         ret
         # nopl 0(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 6 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -47,14 +47,14 @@
         ret
         # nopw 0(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 7 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
         ret
         # nopl 0L(%[re]ax)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 8 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -68,7 +68,7 @@
         ret
         # nopl 0L(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 3, 0x90
+        .align 3
         ret
 # 9 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -81,7 +81,7 @@
         ret
         # nopw 0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 10 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -94,7 +94,7 @@
         ret
         # nopw %cs:0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 11 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -105,7 +105,7 @@
         ret
         # nopw %cs:0L(%[re]ax,%[re]ax,1)
         # 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 12 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -117,7 +117,7 @@
         # nopw 0(%[re]ax,%[re]ax,1)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 13 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -128,7 +128,7 @@
         # nopl 0L(%[re]ax)
         # 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 14 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -138,7 +138,7 @@
         # nopl 0L(%[re]ax)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 # 15 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
@@ -147,7 +147,7 @@
         # nopl 0L(%[re]ax,%[re]ax,1)
         # 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
         # 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
-        .align 4, 0x90
+        .align 4
         ret
 
         # Only the .text sections gets optimal nops.
diff --git a/llvm/test/MC/RISCV/align-fill-byte-zero.s b/llvm/test/MC/RISCV/align-fill-byte-zero.s
new file mode 100644
index 0000000000000..cad881ea4e82c
--- /dev/null
+++ b/llvm/test/MC/RISCV/align-fill-byte-zero.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple riscv32 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple riscv32 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// RUN: llvm-mc -triple riscv64 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple riscv64 -filetype obj %s -o - | \
+// RUN:   llvm-objdump -dz - | FileCheck %s --check-prefix=OBJ
+
+// llvm.org/pr30955 - LLVM was handling `.balign <alignment>, 0` strangely on
+// non-x86 targets.
+
+  .text
+
+// ASM: addi     a0, a0, 1
+// OBJ: 00150513      addi     a0, a0, 0x1
+  addi a0, a0, 0x1
+
+// ASM: .p2align 4, 0x0
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+// OBJ-NEXT: 0000          <unknown>
+  .balign 0x10, 0
+
+// ASM: addi     a0, a0, 1
+// OBJ-NEXT: 00150513      addi     a0, a0, 0x1
+  addi a0, a0, 0x1
diff --git a/llvm/test/MC/X86/code16gcc-align.s b/llvm/test/MC/X86/code16gcc-align.s
index 657364a604275..107986a9ec70e 100644
--- a/llvm/test/MC/X86/code16gcc-align.s
+++ b/llvm/test/MC/X86/code16gcc-align.s
@@ -21,19 +21,19 @@
 	.text
 	.code16gcc
 	.globl	test
-	.p2align	4, 0x90
+	.p2align	4
 	.type	test, at function
 test:
 	.nops	34
 	movl	%eax, %edi
 	xorl	%ebx, %ebx
-	.p2align	4, 0x90
+	.p2align	4
 	movzbl	(%esi,%ebx), %ecx
 	calll	called
 	.nops	3
 	retl
 
-	.p2align	4, 0x90
+	.p2align	4
 	.type	called, at function
 called:
 	.nops	1

>From 97839a8862bde89f07060ff0aa4993a868a501cd Mon Sep 17 00:00:00 2001
From: Andy Kaylor <andrew.kaylor at intel.com>
Date: Wed, 24 Jul 2024 10:25:03 -0700
Subject: [PATCH 048/259] [Driver] Clean up fp-contract handling in clang
 driver (#99723)

This change refactors the fp-contract handling in
RenderFloatingPointOptions in the clang driver code and fixes some
inconsistencies in the way warnings are reported for changes in the
fp-contract behavior.
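
As a rough illustration of what the contract setting controls (a hypothetical
C++ snippet, not part of this patch): with contraction enabled the compiler
may fold the multiply-add below into a single fused operation, while
-ffp-contract=off preserves the intermediate rounding.

  // fma-demo.cpp -- illustrative only; actual codegen depends on the target.
  double muladd(double a, double b, double c) {
    return a * b + c; // fusion candidate under -ffp-contract=on or =fast
  }

With this change, a command line such as
`clang -ffast-math -ffp-contract=off -c fma-demo.cpp` consistently warns that
'-ffp-contract=off' overrides '-ffast-math', matching the WARN_FM_OFF checks
added to fp-contract.c below.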
---
 clang/lib/Driver/ToolChains/Clang.cpp |  90 ++++++++++++---------
 clang/test/Driver/fp-contract.c       | 108 +++++++++++++++-----------
 clang/test/Driver/fp-model.c          |  13 ++++
 3 files changed, 130 insertions(+), 81 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c9cb87031f4f8..df1bb8e9ee308 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2867,6 +2867,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   // If one wasn't given by the user, don't pass it here.
   StringRef FPContract;
   StringRef LastSeenFfpContractOption;
+  StringRef LastFpContractOverrideOption;
   bool SeenUnsafeMathModeOption = false;
   if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
       !JA.isOffloading(Action::OFK_HIP))
@@ -2908,6 +2909,27 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     SeenUnsafeMathModeOption = true;
   };
 
+  // Lambda to consolidate common handling for fp-contract
+  auto restoreFPContractState = [&]() {
+    // CUDA and HIP don't rely on the frontend to pass an ffp-contract option.
+    // For other targets, if the state has been changed by one of the
+    // unsafe-math umbrella options a subsequent -fno-fast-math or
+    // -fno-unsafe-math-optimizations option reverts to the last value seen for
+    // the -ffp-contract option or "on" if we have not seen the -ffp-contract
+    // option. If we have not seen an unsafe-math option or -ffp-contract,
+    // we leave the FPContract state unchanged.
+    if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
+        !JA.isOffloading(Action::OFK_HIP)) {
+      if (LastSeenFfpContractOption != "")
+        FPContract = LastSeenFfpContractOption;
+      else if (SeenUnsafeMathModeOption)
+        FPContract = "on";
+    }
+    // In this case, we're reverting to the last explicit fp-contract option
+    // or the platform default
+    LastFpContractOverrideOption = "";
+  };
+
   if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
     CmdArgs.push_back("-mlimit-float-precision");
     CmdArgs.push_back(A->getValue());
@@ -3009,7 +3031,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       AssociativeMath = false;
       ReciprocalMath = false;
       SignedZeros = true;
-      FPContract = "on";
 
       StringRef Val = A->getValue();
       if (OFastEnabled && Val != "fast") {
@@ -3026,14 +3047,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       if (Val == "fast") {
         FPModel = Val;
         applyFastMath();
+        // applyFastMath sets fp-contract="fast"
+        LastFpContractOverrideOption = "-ffp-model=fast";
       } else if (Val == "precise") {
         FPModel = Val;
         FPContract = "on";
+        LastFpContractOverrideOption = "-ffp-model=precise";
       } else if (Val == "strict") {
         StrictFPModel = true;
         FPExceptionBehavior = "strict";
         FPModel = Val;
         FPContract = "off";
+        LastFpContractOverrideOption = "-ffp-model=strict";
         TrappingMath = true;
         RoundingFPMath = true;
       } else
@@ -3112,8 +3137,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       StringRef Val = A->getValue();
       if (Val == "fast" || Val == "on" || Val == "off" ||
           Val == "fast-honor-pragmas") {
+        if (Val != FPContract && LastFpContractOverrideOption != "") {
+          D.Diag(clang::diag::warn_drv_overriding_option)
+              << LastFpContractOverrideOption
+              << Args.MakeArgString("-ffp-contract=" + Val);
+        }
+
         FPContract = Val;
         LastSeenFfpContractOption = Val;
+        LastFpContractOverrideOption = "";
       } else
         D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Val;
@@ -3192,6 +3224,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       TrappingMath = false;
       FPExceptionBehavior = "";
       FPContract = "fast";
+      LastFpContractOverrideOption = "-funsafe-math-optimizations";
       SeenUnsafeMathModeOption = true;
       break;
     case options::OPT_fno_unsafe_math_optimizations:
@@ -3199,14 +3232,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       SignedZeros = true;
       ApproxFunc = false;
-
-      if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
-          !JA.isOffloading(Action::OFK_HIP)) {
-        if (LastSeenFfpContractOption != "") {
-          FPContract = LastSeenFfpContractOption;
-        } else if (SeenUnsafeMathModeOption)
-          FPContract = "on";
-      }
+      restoreFPContractState();
       break;
 
     case options::OPT_Ofast:
@@ -3214,10 +3240,13 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       if (!OFastEnabled)
         continue;
       [[fallthrough]];
-    case options::OPT_ffast_math: {
+    case options::OPT_ffast_math:
       applyFastMath();
+      if (A->getOption().getID() == options::OPT_Ofast)
+        LastFpContractOverrideOption = "-Ofast";
+      else
+        LastFpContractOverrideOption = "-ffast-math";
       break;
-    }
     case options::OPT_fno_fast_math:
       HonorINFs = true;
       HonorNaNs = true;
@@ -3229,16 +3258,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       ApproxFunc = false;
       SignedZeros = true;
-      // -fno_fast_math restores default fpcontract handling
-      if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
-          !JA.isOffloading(Action::OFK_HIP)) {
-        if (LastSeenFfpContractOption != "") {
-          FPContract = LastSeenFfpContractOption;
-        } else if (SeenUnsafeMathModeOption)
-          FPContract = "on";
-      }
+      restoreFPContractState();
+      LastFpContractOverrideOption = "";
       break;
-    }
+    } // End switch (A->getOption().getID())
+
     // The StrictFPModel local variable is needed to report warnings
     // in the way we intend. If -ffp-model=strict has been used, we
     // want to report a warning for the next option encountered that
@@ -3256,12 +3280,17 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       else {
         StrictFPModel = false;
         FPModel = "";
+        // The warning for -ffp-contract would have been reported by the
+        // OPT_ffp_contract_EQ handler above. A special check here is needed
+        // to avoid duplicating the warning.
         auto RHS = (A->getNumValues() == 0)
                        ? A->getSpelling()
                        : Args.MakeArgString(A->getSpelling() + A->getValue());
-        if (RHS != "-ffp-model=strict")
-          D.Diag(clang::diag::warn_drv_overriding_option)
-              << "-ffp-model=strict" << RHS;
+        if (A->getSpelling() != "-ffp-contract=") {
+          if (RHS != "-ffp-model=strict")
+            D.Diag(clang::diag::warn_drv_overriding_option)
+                << "-ffp-model=strict" << RHS;
+        }
       }
     }
 
@@ -3343,21 +3372,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   // individual features enabled by -ffast-math instead of the option itself as
   // that's consistent with gcc's behaviour.
   if (!HonorINFs && !HonorNaNs && !MathErrno && AssociativeMath && ApproxFunc &&
-      ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath) {
+      ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath)
     CmdArgs.push_back("-ffast-math");
-    if (FPModel == "fast") {
-      if (FPContract == "fast")
-        // All set, do nothing.
-        ;
-      else if (FPContract.empty())
-        // Enable -ffp-contract=fast
-        CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast"));
-      else
-        D.Diag(clang::diag::warn_drv_overriding_option)
-            << "-ffp-model=fast"
-            << Args.MakeArgString("-ffp-contract=" + FPContract);
-    }
-  }
 
   // Handle __FINITE_MATH_ONLY__ similarly.
   if (!HonorINFs && !HonorNaNs)
diff --git a/clang/test/Driver/fp-contract.c b/clang/test/Driver/fp-contract.c
index e2691dc211cc3..cab63683ee813 100644
--- a/clang/test/Driver/fp-contract.c
+++ b/clang/test/Driver/fp-contract.c
@@ -2,6 +2,14 @@
 // the options -ffast-math, -fno-fast-math, funsafe-math-optimizations,
 // fno-unsafe-math-optimizations.
 
+// These warning checks are above the run lines because the warning is reported
+// before the driver options that are checked below the run lines.
+// WARN_FM_OFF: warning: overriding '-ffast-math' option with '-ffp-contract=off'
+// WARN_FM_ON: warning: overriding '-ffast-math' option with '-ffp-contract=on'
+// WARN_FM_FHP: warning: overriding '-ffast-math' option with '-ffp-contract=fast-honor-pragmas'
+// WARN_UM_OFF: warning: overriding '-funsafe-math-optimizations' option with '-ffp-contract=off'
+// WARN_UM_ON: warning: overriding '-funsafe-math-optimizations' option with '-ffp-contract=on'
+
 // ffast-math, fno-fast-math
 // RUN: %clang -### -ffast-math -c %s 2>&1  \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -10,19 +18,19 @@
 // RUN: %clang -### -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 // CHECK-FPC-ON:       "-ffp-contract=on"
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 // CHECK-FPC-OFF:      "-ffp-contract=off"
 
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST-HONOR %s
+// RUN: %clang -### -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST-HONOR,WARN_FM_FHP %s
 // CHECK-FPC-FAST-HONOR:     "-ffp-contract=fast-honor-pragmas"
 
 // RUN: %clang -### -Werror -ffp-contract=fast -ffast-math -c %s 2>&1 \
@@ -43,23 +51,23 @@
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_OFF %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=fast \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=fast \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_FM_OFF %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 
 // RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -112,24 +120,24 @@
 // RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=on \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=on \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_FM_ON %s
 
-// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=off \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=off \
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_FM_OFF %s
 
 // funsafe-math-optimizations, fno-unsafe-math-optimizations
-// RUN: %clang -### -funsafe-math-optimizations -c %s 2>&1  \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -c %s 2>&1  \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
 
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
@@ -151,27 +159,27 @@
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
-// RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
+// RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -ffp-contract=fast -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
-// RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
+// RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_OFF %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -ffp-contract=fast \
-// RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-FAST,WARN_UM_OFF %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN:   | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
 
 // RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
@@ -229,9 +237,21 @@
 // RUN:   -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN:   -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-ON,WARN_UM_ON %s
 
-// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN:   -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefixes=CHECK-FPC-OFF,WARN_UM_OFF %s
+
+// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN_UM_OFF %s
+
+// This case should not warn
+// RUN: %clang -### -Werror -funsafe-math-optimizations \
+// RUN: -fno-unsafe-math-optimizations -ffp-contract=off -c %s
+
+// RUN: %clang -### -ffast-math -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN_FM_OFF %s
 
+// This case should not warn
+// RUN: %clang -### -Werror -ffast-math -fno-fast-math -ffp-contract=off -c %s
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index 644523394d6b1..2348d4b41f43a 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -81,6 +81,19 @@
 // RUN:   | FileCheck --check-prefix=WARN13 %s
 // WARN13: warning: overriding '-ffp-model=strict' option with '-fapprox-func' [-Woverriding-option]
 
+// RUN: %clang -### -ffp-model=precise -ffp-contract=off -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN14 %s
+// WARN14: warning: overriding '-ffp-model=precise' option with '-ffp-contract=off' [-Woverriding-option]
+
+// RUN: %clang -### -ffp-model=precise -ffp-contract=fast -c %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=WARN15 %s
+// WARN15: warning: overriding '-ffp-model=precise' option with '-ffp-contract=fast' [-Woverriding-option]
+
+// RUN: %clang -### -ffp-model=strict -fassociative-math -ffp-contract=on \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefix=WARN16 %s
+// WARN16: warning: overriding '-ffp-model=strict' option with '-fassociative-math' [-Woverriding-option]
+// WARN16: warning: overriding '-ffp-model=strict' option with '-ffp-contract=on' [-Woverriding-option]
+
 // RUN: %clang -### -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NOROUND %s
 // CHECK-NOROUND: "-cc1"

>From 2e57e6366677390110f5382894c8afeba8da7419 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Wed, 24 Jul 2024 12:25:21 -0500
Subject: [PATCH 049/259] [OpenMP][libomp] Fix tasking debug assert (#95823)

The debug assert is meant to check that the index is valid, which means the
runtime needs to check against the size of the threads-data array instead of
the number of threads in the team. A freed thread that has been returned to
the thread pool may index anywhere into the task team's array, from 0 up to
tt_max_threads.

Fixes: #94260
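
A condensed sketch of the failing shape (an illustrative reduction; the
issue-94260 tests added below are the authoritative reproducers): threads
returned to the pool and later reused by a smaller nested team can end up
stealing from slots past the nested team's thread count.

  // nested-tasks.cpp -- hypothetical reduction, not part of the patch.
  #include <omp.h>
  int main() {
    omp_set_max_active_levels(2);
  #pragma omp parallel num_threads(4)   // outer team
    {
  #pragma omp task
      { (void)omp_get_thread_num(); }
  #pragma omp parallel num_threads(2)   // nested team reuses pooled threads
      {
  #pragma omp task
        { (void)omp_get_thread_num(); }
      }
    }
    return 0;
  }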
---
 openmp/runtime/src/kmp_tasking.cpp            |  2 +-
 openmp/runtime/test/tasking/issue-94260-1.cpp | 66 +++++++++++++++++++
 openmp/runtime/test/tasking/issue-94260-2.c   | 51 ++++++++++++++
 3 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 openmp/runtime/test/tasking/issue-94260-1.cpp
 create mode 100644 openmp/runtime/test/tasking/issue-94260-2.c

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 936afb91ac2f8..03ce0dd752af1 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3245,7 +3245,7 @@ static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
   threads_data = task_team->tt.tt_threads_data;
   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
   KMP_DEBUG_ASSERT(victim_tid >= 0);
-  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_nproc);
+  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
 
   victim_td = &threads_data[victim_tid];
   victim_thr = victim_td->td.td_thr;
diff --git a/openmp/runtime/test/tasking/issue-94260-1.cpp b/openmp/runtime/test/tasking/issue-94260-1.cpp
new file mode 100644
index 0000000000000..6f2bf2ae0a859
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-94260-1.cpp
@@ -0,0 +1,66 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// The number of times to run each test
+#define NTIMES 2
+
+// Every thread creates a single "increment" task
+void test_tasks() {
+  for (int i = 0; i < 100; ++i)
+#pragma omp task
+  {
+    int tid = omp_get_thread_num();
+  }
+}
+
+// Testing single level of parallelism with increment tasks
+void test_base(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("    test_base(%d)\n", nthreads);
+#endif
+#pragma omp parallel num_threads(nthreads)
+  { test_tasks(); }
+}
+
+// Testing nested parallel with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("   test_nest(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+  {
+    for (int i = 0; i < 100; ++i)
+#pragma omp task
+    {
+      int tid = omp_get_thread_num();
+    }
+    test_base(second);
+  }
+}
+
+template <typename... Args>
+void run_ntimes(int n, void (*func)(Args...), Args... args) {
+  for (int i = 0; i < n; ++i) {
+    func(args...);
+  }
+}
+
+int main() {
+  omp_set_max_active_levels(5);
+
+  for (int i = 0; i < 100; ++i) {
+    run_ntimes(NTIMES, test_nest, 4, 3);
+    run_ntimes(NTIMES, test_nest, 2, 1);
+  }
+
+  printf("PASS\n");
+  return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/tasking/issue-94260-2.c b/openmp/runtime/test/tasking/issue-94260-2.c
new file mode 100644
index 0000000000000..c2f43f91f70f9
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-94260-2.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int test_omp_parallel_num_threads() {
+  int num_failed;
+  int threads;
+  int nthreads;
+  int max_threads = 0;
+
+  num_failed = 0;
+#pragma omp task
+  {}
+
+/* first we check how many threads are available */
+#pragma omp parallel
+  {
+#pragma omp task
+    {}
+#pragma omp master
+    max_threads = omp_get_num_threads();
+  }
+
+  /* we increase the number of threads from one to maximum:*/
+  for (threads = 1; threads <= max_threads; threads++) {
+    nthreads = 0;
+#pragma omp parallel reduction(+ : num_failed) num_threads(threads)
+    {
+#pragma omp task
+      {}
+      num_failed = num_failed + !(threads == omp_get_num_threads());
+#pragma omp atomic
+      nthreads += 1;
+    }
+    num_failed = num_failed + !(nthreads == threads);
+  }
+  return (!num_failed);
+}
+
+int main() {
+  int i;
+  int num_failed = 0;
+
+  for (i = 0; i < 100; i++) {
+    if (!test_omp_parallel_num_threads()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}

>From c628dbd030ff084fe5e06315ddf09a47182e9716 Mon Sep 17 00:00:00 2001
From: smanna12 <soumi.manna at intel.com>
Date: Wed, 24 Jul 2024 10:41:33 -0700
Subject: [PATCH 050/259] [Clang] Prevent null pointer dereference in
 designated initializer check (#97220)

This patch adds an assertion in clang::SemaObjC::BuildInstanceMessage()
to ensure getCurMethodDecl() returns a valid method declaration,
addressing a static analyzer finding.
---
 clang/lib/Sema/SemaExprObjC.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 7ccecf055feed..2751c7cec2842 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -3206,9 +3206,10 @@ ExprResult SemaObjC::BuildInstanceMessage(
     }
     if (!isDesignatedInitChain) {
       const ObjCMethodDecl *InitMethod = nullptr;
+      auto *CurMD = SemaRef.getCurMethodDecl();
+      assert(CurMD && "Current method declaration should not be null");
       bool isDesignated =
-          SemaRef.getCurMethodDecl()->isDesignatedInitializerForTheInterface(
-              &InitMethod);
+          CurMD->isDesignatedInitializerForTheInterface(&InitMethod);
       assert(isDesignated && InitMethod);
       (void)isDesignated;
       Diag(SelLoc, SuperLoc.isValid() ?

>From 8d3252a8987818171878a26e4298b4b5dbf2a7e9 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq at xs4all.nl>
Date: Wed, 24 Jul 2024 19:42:48 +0200
Subject: [PATCH 051/259] [libc++][spaceship] Implements X::iterator container
 requirements. (#99343)

This implements the container iterator requirements for array, deque, vector,
and `vector<bool>`.

Implements:
- LWG3352 strong_equality isn't a thing

Implements parts of:
- P1614R2 The Mothership has Landed

Fixes: https://github.com/llvm/llvm-project/issues/62486
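
A quick usage sketch (not taken from the patch; the new iterators.pass.cpp
tests below are the real coverage): in C++20 mode these container iterators
now provide operator<=> returning std::strong_ordering.

  #include <cassert>
  #include <compare>
  #include <type_traits>
  #include <vector>

  int main() {
    std::vector<int> v{1, 2, 3};
    auto r = v.begin() <=> (v.begin() + 1);
    static_assert(std::is_same_v<decltype(r), std::strong_ordering>);
    assert(r == std::strong_ordering::less);
  }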
---
 libcxx/docs/Status/Cxx20Issues.csv            |  2 +-
 libcxx/docs/Status/SpaceshipProjects.csv      |  2 +-
 libcxx/include/__bit_reference                | 14 +++
 libcxx/include/__iterator/bounded_iter.h      | 24 +++++
 libcxx/include/__iterator/wrap_iter.h         | 23 +++++
 libcxx/include/deque                          | 29 +++++-
 .../bounded_iter/comparison.pass.cpp          | 14 ++-
 .../sequences/array/iterators.pass.cpp        | 18 ++++
 .../sequences/deque/iterators.pass.cpp        | 29 ++++++
 .../sequences/vector.bool/iterators.pass.cpp  | 37 ++++++++
 .../sequences/vector/iterators.pass.cpp       | 49 +++++++++-
 .../span.iterators/iterator.pass.cpp          | 92 +++++++++++++++++++
 .../string.view.iterators/iterators.pass.cpp  | 85 +++++++++++++++++
 libcxx/test/support/test_iterators.h          |  2 +
 14 files changed, 413 insertions(+), 7 deletions(-)
 create mode 100644 libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp
 create mode 100644 libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp

diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 4f8118b5a35fb..55a0f85203a15 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -264,7 +264,7 @@
 "`3349 <https://wg21.link/LWG3349>`__","Missing ``__cpp_lib_constexpr_complex``\  for P0415R1","Prague","|Complete|","16.0"
 "`3350 <https://wg21.link/LWG3350>`__","Simplify return type of ``lexicographical_compare_three_way``\ ","Prague","|Complete|","17.0","|spaceship|"
 "`3351 <https://wg21.link/LWG3351>`__","``ranges::enable_safe_range``\  should not be constrained","Prague","|Complete|","15.0","|ranges|"
-"`3352 <https://wg21.link/LWG3352>`__","``strong_equality``\  isn't a thing","Prague","|Nothing To Do|","","|spaceship|"
+"`3352 <https://wg21.link/LWG3352>`__","``strong_equality``\  isn't a thing","Prague","|Complete|","19.0","|spaceship|"
 "`3354 <https://wg21.link/LWG3354>`__","``has_strong_structural_equality``\  has a meaningless definition","Prague","|Nothing To Do|","","|spaceship|"
 "`3355 <https://wg21.link/LWG3355>`__","The memory algorithms should support move-only input iterators introduced by P1207","Prague","|Complete|","15.0","|ranges|"
 "`3356 <https://wg21.link/LWG3356>`__","``__cpp_lib_nothrow_convertible``\  should be ``__cpp_lib_is_nothrow_convertible``\ ","Prague","|Complete|","12.0"
diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv
index e1cf2044cfd78..4dc43cdbbd08f 100644
--- a/libcxx/docs/Status/SpaceshipProjects.csv
+++ b/libcxx/docs/Status/SpaceshipProjects.csv
@@ -83,7 +83,7 @@ Section,Description,Dependencies,Assignee,Complete
 "| `[string.view.synop] <https://wg21.link/string.view.synop>`_
 | `[string.view.comparison] <https://wg21.link/string.view.comparison>`_",| `basic_string_view <https://reviews.llvm.org/D130295>`_,None,Mark de Wever,|Complete|
 - `5.7 Clause 22: Containers library <https://wg21.link/p1614r2#clause-22-containers-library>`_,,,,
-| `[container.requirements.general] <https://wg21.link/container.requirements.general>`_,|,None,Unassigned,|Not Started|
+| `[container.requirements.general] <https://wg21.link/container.requirements.general>`_,|,None,Mark de Wever,|Complete|
 | `[array.syn] <https://wg21.link/array.syn>`_ (`general <https://wg21.link/container.opt.reqmts>`_),| `array <https://reviews.llvm.org/D132265>`_,[expos.only.func],"| Adrian Vogelsgesang
 | Hristo Hristov",|Complete|
 | `[deque.syn] <https://wg21.link/deque.syn>`_ (`general <https://wg21.link/container.opt.reqmts>`_),| `deque <https://reviews.llvm.org/D144821>`_,[expos.only.func],Hristo Hristov,|Complete|
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 606069d98be72..22637d4397412 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -16,6 +16,7 @@
 #include <__bit/countr.h>
 #include <__bit/invert_if.h>
 #include <__bit/popcount.h>
+#include <__compare/ordering.h>
 #include <__config>
 #include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
@@ -913,6 +914,7 @@ public:
     return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_;
   }
 
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool
   operator!=(const __bit_iterator& __x, const __bit_iterator& __y) {
     return !(__x == __y);
@@ -937,6 +939,18 @@ public:
   operator>=(const __bit_iterator& __x, const __bit_iterator& __y) {
     return !(__x < __y);
   }
+#else  // _LIBCPP_STD_VER <= 17
+  _LIBCPP_HIDE_FROM_ABI constexpr friend strong_ordering
+  operator<=>(const __bit_iterator& __x, const __bit_iterator& __y) {
+    if (__x.__seg_ < __y.__seg_)
+      return strong_ordering::less;
+
+    if (__x.__seg_ == __y.__seg_)
+      return __x.__ctz_ <=> __y.__ctz_;
+
+    return strong_ordering::greater;
+  }
+#endif // _LIBCPP_STD_VER <= 17
 
 private:
   _LIBCPP_HIDE_FROM_ABI
diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index ce0823b8c97e4..8a81c9ffbfc3f 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -11,6 +11,8 @@
 #define _LIBCPP___ITERATOR_BOUNDED_ITER_H
 
 #include <__assert>
+#include <__compare/ordering.h>
+#include <__compare/three_way_comparable.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__memory/pointer_traits.h>
@@ -201,10 +203,15 @@ struct __bounded_iter {
   operator==(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ == __y.__current_;
   }
+
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool
   operator!=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ != __y.__current_;
   }
+#endif
+
+  // TODO(mordante) disable these overloads in the LLVM 20 release.
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool
   operator<(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
     return __x.__current_ < __y.__current_;
@@ -222,6 +229,23 @@ struct __bounded_iter {
     return __x.__current_ >= __y.__current_;
   }
 
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_HIDE_FROM_ABI constexpr friend strong_ordering
+  operator<=>(__bounded_iter const& __x, __bounded_iter const& __y) noexcept {
+    if constexpr (three_way_comparable<_Iterator, strong_ordering>) {
+      return __x.__current_ <=> __y.__current_;
+    } else {
+      if (__x.__current_ < __y.__current_)
+        return strong_ordering::less;
+
+      if (__x.__current_ == __y.__current_)
+        return strong_ordering::equal;
+
+      return strong_ordering::greater;
+    }
+  }
+#endif // _LIBCPP_STD_VER >= 20
+
 private:
   template <class>
   friend struct pointer_traits;
diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h
index 252d13b26c9e2..56183c0ee794d 100644
--- a/libcxx/include/__iterator/wrap_iter.h
+++ b/libcxx/include/__iterator/wrap_iter.h
@@ -10,6 +10,8 @@
 #ifndef _LIBCPP___ITERATOR_WRAP_ITER_H
 #define _LIBCPP___ITERATOR_WRAP_ITER_H
 
+#include <__compare/ordering.h>
+#include <__compare/three_way_comparable.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__memory/addressof.h>
@@ -131,6 +133,7 @@ operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXC
   return __x.base() < __y.base();
 }
 
+#if _LIBCPP_STD_VER <= 17
 template <class _Iter1>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT {
@@ -142,7 +145,9 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT {
   return !(__x == __y);
 }
+#endif
 
+// TODO(mordante) disable these overloads in the LLVM 20 release.
 template <class _Iter1>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
 operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT {
@@ -179,6 +184,24 @@ operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEX
   return !(__y < __x);
 }
 
+#if _LIBCPP_STD_VER >= 20
+template <class _Iter1, class _Iter2>
+_LIBCPP_HIDE_FROM_ABI constexpr strong_ordering
+operator<=>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) noexcept {
+  if constexpr (three_way_comparable_with<_Iter1, _Iter2, strong_ordering>) {
+    return __x.base() <=> __y.base();
+  } else {
+    if (__x.base() < __y.base())
+      return strong_ordering::less;
+
+    if (__x.base() == __y.base())
+      return strong_ordering::equal;
+
+    return strong_ordering::greater;
+  }
+}
+#endif // _LIBCPP_STD_VER >= 20
+
 template <class _Iter1, class _Iter2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
 #ifndef _LIBCPP_CXX03_LANG
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 4fc994a6e229b..e73135a8647b9 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -376,10 +376,13 @@ public:
     return __x.__ptr_ == __y.__ptr_;
   }
 
+#if _LIBCPP_STD_VER <= 17
   _LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __deque_iterator& __x, const __deque_iterator& __y) {
     return !(__x == __y);
   }
+#endif
 
+  // TODO(mordante) disable these overloads in the LLVM 20 release.
   _LIBCPP_HIDE_FROM_ABI friend bool operator<(const __deque_iterator& __x, const __deque_iterator& __y) {
     return __x.__m_iter_ < __y.__m_iter_ || (__x.__m_iter_ == __y.__m_iter_ && __x.__ptr_ < __y.__ptr_);
   }
@@ -396,6 +399,29 @@ public:
     return !(__x < __y);
   }
 
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_HIDE_FROM_ABI friend strong_ordering operator<=>(const __deque_iterator& __x, const __deque_iterator& __y) {
+    if (__x.__m_iter_ < __y.__m_iter_)
+      return strong_ordering::less;
+
+    if (__x.__m_iter_ == __y.__m_iter_) {
+      if constexpr (three_way_comparable<pointer, strong_ordering>) {
+        return __x.__ptr_ <=> __y.__ptr_;
+      } else {
+        if (__x.__ptr_ < __y.__ptr_)
+          return strong_ordering::less;
+
+        if (__x.__ptr_ == __y.__ptr_)
+          return strong_ordering::equal;
+
+        return strong_ordering::greater;
+      }
+    }
+
+    return strong_ordering::greater;
+  }
+#endif // _LIBCPP_STD_VER >= 20
+
 private:
   _LIBCPP_HIDE_FROM_ABI explicit __deque_iterator(__map_iterator __m, pointer __p) _NOEXCEPT
       : __m_iter_(__m),
@@ -2530,8 +2556,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const deque<_Tp, _Allocator>& __x,
 template <class _Tp, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
 operator<=>(const deque<_Tp, _Allocator>& __x, const deque<_Tp, _Allocator>& __y) {
-  return std::lexicographical_compare_three_way(
-      __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
+  return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
 
 #endif // _LIBCPP_STD_VER <= 17
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
index 9c5df5da55b9c..cef2157469c8f 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/comparison.pass.cpp
@@ -11,6 +11,7 @@
 //
 // Comparison operators
 
+#include <concepts>
 #include <__iterator/bounded_iter.h>
 
 #include "test_iterators.h"
@@ -59,6 +60,12 @@ TEST_CONSTEXPR_CXX14 bool tests() {
     assert(iter1 >= iter1);
   }
 
+#if TEST_STD_VER >= 20
+  // P1614
+  std::same_as<std::strong_ordering> decltype(auto) r1 = iter1 <=> iter2;
+  assert(r1 == std::strong_ordering::less);
+#endif
+
   return true;
 }
 
@@ -69,8 +76,11 @@ int main(int, char**) {
 #endif
 
 #if TEST_STD_VER > 17
-  tests<contiguous_iterator<int*> >();
-  static_assert(tests<contiguous_iterator<int*> >(), "");
+  tests<contiguous_iterator<int*>>();
+  static_assert(tests<contiguous_iterator<int*>>());
+
+  tests<three_way_contiguous_iterator<int*>>();
+  static_assert(tests<three_way_contiguous_iterator<int*>>());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
index 106bc45c70998..710994c68295e 100644
--- a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
@@ -148,6 +148,15 @@ TEST_CONSTEXPR_CXX17 bool tests()
             assert(std::rbegin(c)  != std::rend(c));
             assert(std::cbegin(c)  != std::cend(c));
             assert(std::crbegin(c) != std::crend(c));
+
+#  if TEST_STD_VER >= 20
+            // P1614 + LWG3352
+            std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+            assert(r1 == std::strong_ordering::equal);
+
+            std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+            assert(r2 == std::strong_ordering::equal);
+#  endif
         }
         {
             typedef std::array<int, 0> C;
@@ -189,6 +198,15 @@ TEST_CONSTEXPR_CXX17 bool tests()
             assert(std::rbegin(c)  == std::rend(c));
             assert(std::cbegin(c)  == std::cend(c));
             assert(std::crbegin(c) == std::crend(c));
+
+#  if TEST_STD_VER >= 20
+            // P1614 + LWG3352
+            std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+            assert(r1 == std::strong_ordering::equal);
+
+            std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+            assert(r2 == std::strong_ordering::equal);
+#  endif
         }
     }
 #endif
diff --git a/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp b/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
index 1f06ffde41ac2..484a2961fdb0c 100644
--- a/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/deque/iterators.pass.cpp
@@ -41,7 +41,27 @@ int main(int, char**)
     i = c.begin();
     C::const_iterator j;
     j = c.cbegin();
+
     assert(i == j);
+    assert(!(i != j));
+
+    assert(!(i < j));
+    assert((i <= j));
+
+    assert(!(i > j));
+    assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+    // P1614 + LWG3352
+    // When the allocator does not have operator<=> then the iterator uses a
+    // fallback to provide operator<=>.
+    // Make sure to test with an allocator that does not have operator<=>.
+    static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+    static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+    std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+    assert(r1 == std::strong_ordering::equal);
+#  endif
     }
 #endif
 #if TEST_STD_VER > 11
@@ -74,6 +94,15 @@ int main(int, char**)
 //         assert ( cii != c.begin());
 //         assert ( cii != c.cend());
 //         assert ( ii1 != c.end());
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER >= 20
     }
 #endif
 
diff --git a/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
index 9aaaac7a5557f..1e4877e8d2443 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/iterators.pass.cpp
@@ -77,7 +77,21 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::iterator i = c.begin();
         C::iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r = i <=> j;
+        assert(r == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef bool T;
@@ -86,7 +100,21 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::const_iterator i = c.begin();
         C::const_iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r = i <=> j;
+        assert(r == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef bool T;
@@ -131,6 +159,15 @@ TEST_CONSTEXPR_CXX20 bool tests()
         assert ( (cii >= ii1 ));
         assert (cii - ii1 == 0);
         assert (ii1 - cii == 0);
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER >= 20
     }
 #endif
 
diff --git a/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp b/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
index 70e0e35767e09..0aa7ad0d42ed7 100644
--- a/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/iterators.pass.cpp
@@ -87,7 +87,27 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::iterator i = c.begin();
         C::iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        // When the allocator does not have operator<=> then the iterator uses a
+        // fallback to provide operator<=>.
+        // Make sure to test with an allocator that does not have operator<=>.
+        static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+        static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+        std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+        assert(r1 == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef int T;
@@ -96,7 +116,26 @@ TEST_CONSTEXPR_CXX20 bool tests()
         C::const_iterator i = c.begin();
         C::const_iterator j = c.end();
         assert(std::distance(i, j) == 0);
+
         assert(i == j);
+        assert(!(i != j));
+
+        assert(!(i < j));
+        assert((i <= j));
+
+        assert(!(i > j));
+        assert((i >= j));
+
+#  if TEST_STD_VER >= 20
+        // When the allocator does not have operator<=> then the iterator uses a
+        // fallback to provide operator<=>.
+        // Make sure to test with an allocator that does not have operator<=>.
+        static_assert(!std::three_way_comparable<min_allocator<int>, std::strong_ordering>);
+        static_assert(std::three_way_comparable<typename C::iterator, std::strong_ordering>);
+
+        std::same_as<std::strong_ordering> decltype(auto) r1 = i <=> j;
+        assert(r1 == std::strong_ordering::equal);
+#  endif
     }
     {
         typedef int T;
@@ -164,8 +203,16 @@ TEST_CONSTEXPR_CXX20 bool tests()
         assert ( (cii >= ii1 ));
         assert (cii - ii1 == 0);
         assert (ii1 - cii == 0);
+#  if TEST_STD_VER >= 20
+        // P1614 + LWG3352
+        std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+        assert(r1 == std::strong_ordering::equal);
+
+        std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+        assert(r2 == std::strong_ordering::equal);
+#  endif // TEST_STD_VER >= 20
     }
-#endif
+#endif // TEST_STD_VER > 11
 
     return true;
 }
diff --git a/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp b/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp
new file mode 100644
index 0000000000000..13a7628e6043d
--- /dev/null
+++ b/libcxx/test/std/containers/views/views.span/span.iterators/iterator.pass.cpp
@@ -0,0 +1,92 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <span>
+
+// class iterator
+
+#include <cassert>
+#include <concepts>
+#include <iterator>
+#include <span>
+#include <string>
+#include <version> // __cpp_lib_ranges_as_const is not defined in span.
+
+#include "test_macros.h"
+
+template <class T>
+constexpr void test_type() {
+  using C = std::span<T>;
+  typename C::iterator ii1{}, ii2{};
+  typename C::iterator ii4 = ii1;
+  // TODO Test against C++23 after implementing
+  //  P2278R4 cbegin should always return a constant iterator
+  // This means adjusting the #ifdef to guard against C++23.
+#ifdef __cpp_lib_ranges_as_const
+  typename C::const_iterator cii{};
+#endif
+  assert(ii1 == ii2);
+  assert(ii1 == ii4);
+#ifdef __cpp_lib_ranges_as_const
+  assert(ii1 == cii);
+#endif
+
+  assert(!(ii1 != ii2));
+#ifdef __cpp_lib_ranges_as_const
+  assert(!(ii1 != cii));
+#endif
+
+  T v;
+  C c{&v, 1};
+  assert(c.begin() == std::begin(c));
+  assert(c.rbegin() == std::rbegin(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(c.cbegin() == std::cbegin(c));
+  assert(c.crbegin() == std::crbegin(c));
+#endif
+
+  assert(c.end() == std::end(c));
+  assert(c.rend() == std::rend(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(c.cend() == std::cend(c));
+  assert(c.crend() == std::crend(c));
+#endif
+
+  assert(std::begin(c) != std::end(c));
+  assert(std::rbegin(c) != std::rend(c));
+#ifdef __cpp_lib_ranges_as_const
+  assert(std::cbegin(c) != std::cend(c));
+  assert(std::crbegin(c) != std::crend(c));
+#endif
+
+  // P1614 + LWG3352
+  std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+  assert(r1 == std::strong_ordering::equal);
+
+#ifdef __cpp_lib_ranges_as_const
+  std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+  assert(r2 == std::strong_ordering::equal);
+#endif
+}
+
+constexpr bool test() {
+  test_type<char>();
+  test_type<int>();
+  test_type<std::string>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test(), "");
+
+  return 0;
+}
diff --git a/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp b/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp
new file mode 100644
index 0000000000000..75d492bf7b3c6
--- /dev/null
+++ b/libcxx/test/std/strings/string.view/string.view.iterators/iterators.pass.cpp
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: !stdlib=libc++ && (c++03 || c++11 || c++14)
+
+// <string_view>
+
+// class iterator
+
+#include <cassert>
+#include <concepts>
+#include <iterator>
+#include <string_view>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+template <class CharT>
+TEST_CONSTEXPR_CXX14 void test_type() {
+  using C                  = std::basic_string_view<CharT>;
+  typename C::iterator ii1 = typename C::iterator(), ii2 = typename C::iterator();
+  typename C::iterator ii4       = ii1;
+  typename C::const_iterator cii = typename C::const_iterator();
+  assert(ii1 == ii2);
+  assert(ii1 == ii4);
+  assert(ii1 == cii);
+
+  assert(!(ii1 != ii2));
+  assert(!(ii1 != cii));
+
+#if TEST_STD_VER >= 17
+  C c = MAKE_STRING_VIEW(CharT, "abc");
+  assert(c.begin() == std::begin(c));
+  assert(c.rbegin() == std::rbegin(c));
+  assert(c.cbegin() == std::cbegin(c));
+  assert(c.crbegin() == std::crbegin(c));
+
+  assert(c.end() == std::end(c));
+  assert(c.rend() == std::rend(c));
+  assert(c.cend() == std::cend(c));
+  assert(c.crend() == std::crend(c));
+
+  assert(std::begin(c) != std::end(c));
+  assert(std::rbegin(c) != std::rend(c));
+  assert(std::cbegin(c) != std::cend(c));
+  assert(std::crbegin(c) != std::crend(c));
+#endif
+
+#if TEST_STD_VER >= 20
+  // P1614 + LWG3352
+  std::same_as<std::strong_ordering> decltype(auto) r1 = ii1 <=> ii2;
+  assert(r1 == std::strong_ordering::equal);
+
+  std::same_as<std::strong_ordering> decltype(auto) r2 = cii <=> ii2;
+  assert(r2 == std::strong_ordering::equal);
+#endif
+}
+
+TEST_CONSTEXPR_CXX14 bool test() {
+  test_type<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test_type<wchar_t>();
+#endif
+#ifndef TEST_HAS_NO_CHAR8_T
+  test_type<char8_t>();
+#endif
+  test_type<char16_t>();
+  test_type<char32_t>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 14
+  static_assert(test(), "");
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 31564a3977317..95d1b7df0007c 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -389,6 +389,8 @@ class contiguous_iterator
     friend TEST_CONSTEXPR bool operator> (const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ >  y.it_;}
     friend TEST_CONSTEXPR bool operator>=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ >= y.it_;}
 
+    // Note: no operator<=>; use three_way_contiguous_iterator for testing operator<=>.
+
     friend TEST_CONSTEXPR It base(const contiguous_iterator& i) { return i.it_; }
 
     template <class T>
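
For readers skimming the libc++ hunks above: every new operator<=> overload follows the same fallback pattern, so here is a minimal standalone sketch of it (illustrative only, not libc++ code; the name Wrapper is made up). When the wrapped iterator is three_way_comparable with strong_ordering, <=> is forwarded to it; otherwise a strong_ordering is synthesized from < and ==, which is sound for the random-access iterators these wrappers hold.

  // Illustrative sketch (not part of the patch): the <=> fallback pattern.
  #include <cassert>
  #include <compare>

  template <class It>
  struct Wrapper {                       // hypothetical stand-in for __wrap_iter
    It current;

    friend constexpr std::strong_ordering
    operator<=>(const Wrapper& x, const Wrapper& y) noexcept {
      if constexpr (std::three_way_comparable<It, std::strong_ordering>) {
        return x.current <=> y.current;  // forward when the iterator provides <=>
      } else {
        if (x.current < y.current)
          return std::strong_ordering::less;
        if (x.current == y.current)
          return std::strong_ordering::equal;
        return std::strong_ordering::greater;
      }
    }

    friend constexpr bool operator==(const Wrapper& x, const Wrapper& y) noexcept {
      return x.current == y.current;     // == stays separate; <=> does not replace it
    }
  };

  int main() {
    int a[2] = {1, 2};
    Wrapper<int*> i{a}, j{a + 1};
    assert((i <=> j) == std::strong_ordering::less);
    assert(i < j);                       // <, >, <=, >= are rewritten in terms of <=>
    return 0;
  }

The tests added above exercise both branches: contiguous_iterator has no operator<=> (fallback path), while three_way_contiguous_iterator has one (forwarding path).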

>From fb1e077982e90e715058188fa667ea5dd8027a98 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 12:54:44 -0500
Subject: [PATCH 052/259] [NVPTX] Restore old va_list builtin type (#100438)

Summary:
This was changed to `void *` from `char *` unintentionally; put it back.
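
For context, a rough sketch of what the builtin kind controls (my illustration, not part of the patch; the helper name next_vararg is made up). CharPtrBuiltinVaList makes Clang define __builtin_va_list as effectively a char*, so the byte-offset arithmetic that varargs lowering relies on stays well-formed; with a void* va_list, that arithmetic is not valid in standard C/C++.

  /* Illustrative only: the real typedef is produced by Clang's TargetInfo, not user code. */
  typedef char *nvptx_va_list_like;           /* what CharPtrBuiltinVaList implies */

  static inline void *next_vararg(nvptx_va_list_like *ap, unsigned long size) {
    void *slot = *ap;   /* current argument slot */
    *ap += size;        /* byte-wise advance is valid on char*, not on void* */
    return slot;
  }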
---
 clang/lib/Basic/Targets/NVPTX.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 25dc979d882fd..1ef20ce733917 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -119,7 +119,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
   }
 
   BuiltinVaListKind getBuiltinVaListKind() const override {
-    return TargetInfo::VoidPtrBuiltinVaList;
+    return TargetInfo::CharPtrBuiltinVaList;
   }
 
   bool isValidCPUName(StringRef Name) const override {

>From 3902710a447cd0dc39cc61f001fb023a502db499 Mon Sep 17 00:00:00 2001
From: gonzalobg <65027571+gonzalobg at users.noreply.github.com>
Date: Wed, 24 Jul 2024 19:59:03 +0200
Subject: [PATCH 053/259] [NVPTX] Support fence instruction (#99649)

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 28 +++++++++++++++++++
 llvm/test/CodeGen/NVPTX/fence.ll        | 36 +++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/fence.ll
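
To make the new patterns concrete, a small sketch of my own (not part of the patch): each C++ fence below is emitted as an LLVM IR `fence` with the listed ordering, and per the patterns and the fence.ll test added here it selects the PTX fence instruction on sm_70+/PTX 6.0+ or falls back to membar.sys on older targets.

  #include <atomic>

  // When compiled for the NVPTX target, each call becomes an IR `fence`.
  void fences() {
    std::atomic_thread_fence(std::memory_order_acquire);  // fence acquire -> fence.acq_rel.sys / membar.sys
    std::atomic_thread_fence(std::memory_order_release);  // fence release -> fence.acq_rel.sys / membar.sys
    std::atomic_thread_fence(std::memory_order_acq_rel);  // fence acq_rel -> fence.acq_rel.sys / membar.sys
    std::atomic_thread_fence(std::memory_order_seq_cst);  // fence seq_cst -> fence.sc.sys     / membar.sys
  }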

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index cd17a9de541ad..f82013b63dc6a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3908,3 +3908,31 @@ def : Pat <
   (V2I32toI64
     (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
     (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX Fence instructions
+////////////////////////////////////////////////////////////////////////////////
+
+def atomic_thread_fence_seq_cst_sys :
+  NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
+  Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_sys :
+  NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
+  Requires<[hasPTX<60>, hasSM<70>]>;
+
+def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
+      Requires<[hasPTX<60>, hasSM<70>]>;
+
+
+// If PTX<60 or SM<70, we fall back to MEMBAR:
+def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
+def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
+def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1)
+def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1)
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
new file mode 100644
index 0000000000000..d3aace95e9665
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s  --check-prefix=SM70
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+
+; CHECK-LABEL: fence_sc_sys
+define void @fence_sc_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.sc.sys
+  fence seq_cst
+  ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_sys
+define void @fence_acq_rel_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys
+  fence acq_rel
+  ret void  
+}
+
+; CHECK-LABEL: fence_release_sys
+define void @fence_release_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys  
+  fence release
+  ret void  
+}
+
+; CHECK-LABEL: fence_acquire_sys
+define void @fence_acquire_sys() local_unnamed_addr {
+  ; SM60: membar.sys
+  ; SM70: fence.acq_rel.sys  
+  fence acquire
+  ret void  
+}
\ No newline at end of file

>From f729c09c3a56ec41b558eff1a6475a8d8bd20d41 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Wed, 24 Jul 2024 11:05:05 -0700
Subject: [PATCH 054/259] MC: Inline createMachObjectWriter into MCAsmBackend

We could do the same to COFF once WinCOFFObjectWriter is cleaned up
(#100303).
---
 llvm/include/llvm/MC/MCMachObjectWriter.h | 12 ------------
 llvm/lib/MC/MCAsmBackend.cpp              |  4 ++--
 llvm/lib/MC/MachObjectWriter.cpp          |  7 -------
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h
index 4df9c92ae368d..77652c7ff53a3 100644
--- a/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -353,18 +353,6 @@ class MachObjectWriter final : public MCObjectWriter {
 
   uint64_t writeObject(MCAssembler &Asm) override;
 };
-
-/// Construct a new Mach-O writer instance.
-///
-/// This routine takes ownership of the target writer subclass.
-///
-/// \param MOTW - The target specific Mach-O writer subclass.
-/// \param OS - The stream to write to.
-/// \returns The constructed object writer.
-std::unique_ptr<MCObjectWriter>
-createMachObjectWriter(std::unique_ptr<MCMachObjectTargetWriter> MOTW,
-                       raw_pwrite_stream &OS, bool IsLittleEndian);
-
 } // end namespace llvm
 
 #endif // LLVM_MC_MCMACHOBJECTWRITER_H
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index 7433f41c9e0fb..bab3485b50614 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -35,8 +35,8 @@ MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
   bool IsLE = Endian == llvm::endianness::little;
   switch (TW->getFormat()) {
   case Triple::MachO:
-    return createMachObjectWriter(cast<MCMachObjectTargetWriter>(std::move(TW)),
-                                  OS, IsLE);
+    return std::make_unique<MachObjectWriter>(
+        cast<MCMachObjectTargetWriter>(std::move(TW)), OS, IsLE);
   case Triple::COFF:
     return createWinCOFFObjectWriter(
         cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS);
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 97fe254e4e979..8596f457574a3 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -1108,10 +1108,3 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   return NumBytesWritten();
 }
-
-std::unique_ptr<MCObjectWriter>
-llvm::createMachObjectWriter(std::unique_ptr<MCMachObjectTargetWriter> MOTW,
-                             raw_pwrite_stream &OS, bool IsLittleEndian) {
-  return std::make_unique<MachObjectWriter>(std::move(MOTW), OS,
-                                             IsLittleEndian);
-}

>From f719a339a89bd64900217dd06075e618ed587be9 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie at gmail.com>
Date: Wed, 24 Jul 2024 18:07:24 +0000
Subject: [PATCH 055/259] Mark assert-used-only variable as [[maybe_unused]]

Seemed nicer than inlining, given the complexities of flag checking and
the multiple uses.
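
For readers unfamiliar with the idiom, a minimal illustration (mine, not the patch): a local referenced only inside assert() becomes unused once NDEBUG strips the assertion, and [[maybe_unused]] silences the resulting -Wunused-variable warning without giving up the named constant.

  #include <cassert>

  void check_flags(unsigned Flags) {
    [[maybe_unused]] unsigned KnownBits = 0x3;   // only read inside the assert
    assert((Flags & ~KnownBits) == 0 && "unexpected flag bits");
  }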
---
 llvm/lib/ExecutionEngine/Orc/Core.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 6db8e982894bc..a078bffa86f59 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -933,7 +933,8 @@ Error JITDylib::resolve(MaterializationResponsibility &MR,
             SymbolsInErrorState.insert(KV.first);
           else {
             if (SymI->second.getFlags() & JITSymbolFlags::Common) {
-              auto WeakOrCommon = JITSymbolFlags::Weak | JITSymbolFlags::Common;
+              [[maybe_unused]] auto WeakOrCommon =
+                  JITSymbolFlags::Weak | JITSymbolFlags::Common;
               assert((KV.second.getFlags() & WeakOrCommon) &&
                      "Common symbols must be resolved as common or weak");
               assert((KV.second.getFlags() & ~WeakOrCommon) ==

>From 0891ccc0c68c35e17562c752955788f08054bcdb Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Wed, 24 Jul 2024 13:22:28 -0500
Subject: [PATCH 056/259] [Frontend][OpenMP] Apply ompx_attribute to all
 allowing leaf constructs (#100370)

By default, in a compound directive, a clause will apply to the unique
leaf construct that allows it. Clauses that could apply to multiple leaf
constructs follow different rules.

For ompx_attribute, apply it to all leaf constructs that allow it.
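
As a concrete, purely illustrative case of the rule above: in a compound construct like the one below, the clause is no longer attached to a single leaf; after this change it is distributed to every leaf construct that accepts it. The attribute spelling is only a sketch of how ompx_attribute is commonly used to pass GPU kernel attributes; consult the Clang documentation for the exact accepted forms.

  // Illustrative only: a compound directive with several leaf constructs.
  // Before this patch the clause went to one leaf that allows it; now it is
  // applied to all leaf constructs that allow it.
  void kernel(int n, float *x) {
  #pragma omp target teams distribute parallel for \
      ompx_attribute(__attribute__((launch_bounds(128, 2))))
    for (int i = 0; i < n; ++i)
      x[i] *= 2.0f;
  }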
---
 .../llvm/Frontend/OpenMP/ConstructDecompositionT.h     | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 3feb4bd11c998..349d862135d8c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -236,6 +236,9 @@ struct ConstructDecompositionT {
                    const ClauseTy *);
   bool applyClause(const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
+  bool
+  applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
+              const ClauseTy *);
 
   uint32_t version;
   llvm::omp::Directive construct;
@@ -1101,6 +1104,13 @@ bool ConstructDecompositionT<C, H>::applyClause(
   return applyToOutermost(node);
 }
 
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  return applyToAll(node);
+}
+
 template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   bool success = true;
 

>From 0760aec54ca6f680f4786c4fc3bbae8f500deeab Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 24 Jul 2024 19:32:56 +0100
Subject: [PATCH 057/259] MTM: fix issues after cursory reading (#100404)

---
 llvm/lib/CodeGen/MachineTraceMetrics.cpp | 101 +++++++++++------------
 1 file changed, 46 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index bf3add010574b..dd1faff355b52 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -24,17 +24,11 @@
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <tuple>
-#include <utility>
 
 using namespace llvm;
 
@@ -133,7 +127,7 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
 
   // Scale the resource cycles so they are comparable.
   unsigned PROffset = MBB->getNumber() * PRKinds;
-  for (unsigned K = 0; K != PRKinds; ++K)
+  for (unsigned K = 0; K < PRKinds; ++K)
     ProcReleaseAtCycles[PROffset + K] =
       PRCycles[K] * SchedModel.getResourceFactor(K);
 
@@ -146,15 +140,14 @@ MachineTraceMetrics::getProcReleaseAtCycles(unsigned MBBNum) const {
          "getResources() must be called before getProcReleaseAtCycles()");
   unsigned PRKinds = SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcReleaseAtCycles.size());
-  return ArrayRef(ProcReleaseAtCycles.data() + MBBNum * PRKinds, PRKinds);
+  return ArrayRef{ProcReleaseAtCycles.data() + MBBNum * PRKinds, PRKinds};
 }
 
 //===----------------------------------------------------------------------===//
 //                         Ensemble utility functions
 //===----------------------------------------------------------------------===//
 
-MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
-  : MTM(*ct) {
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *CT) : MTM(*CT) {
   BlockInfo.resize(MTM.BlockInfo.size());
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);
@@ -198,7 +191,7 @@ computeDepthResources(const MachineBasicBlock *MBB) {
   // Compute per-resource depths.
   ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);
   ArrayRef<unsigned> PredPRCycles = MTM.getProcReleaseAtCycles(PredNum);
-  for (unsigned K = 0; K != PRKinds; ++K)
+  for (unsigned K = 0; K < PRKinds; ++K)
     ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];
 }
 
@@ -231,7 +224,7 @@ computeHeightResources(const MachineBasicBlock *MBB) {
 
   // Compute per-resource heights.
   ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);
-  for (unsigned K = 0; K != PRKinds; ++K)
+  for (unsigned K = 0; K < PRKinds; ++K)
     ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];
 }
 
@@ -264,7 +257,7 @@ MachineTraceMetrics::Ensemble::
 getProcResourceDepths(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
-  return ArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds);
+  return ArrayRef{ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds};
 }
 
 /// Get an array of processor resource heights for MBB. Indexed by processor
@@ -277,7 +270,7 @@ MachineTraceMetrics::Ensemble::
 getProcResourceHeights(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
-  return ArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds);
+  return ArrayRef{ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds};
 }
 
 //===----------------------------------------------------------------------===//
@@ -314,8 +307,8 @@ class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
   const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) override;
 
 public:
-  MinInstrCountEnsemble(MachineTraceMetrics *mtm)
-    : MachineTraceMetrics::Ensemble(mtm) {}
+  MinInstrCountEnsemble(MachineTraceMetrics *MTM)
+      : MachineTraceMetrics::Ensemble(MTM) {}
 };
 
 /// Pick only the current basic block for the trace and do not choose any
@@ -395,15 +388,15 @@ MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
 
 // Get an Ensemble sub-class for the requested trace strategy.
 MachineTraceMetrics::Ensemble *
-MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) {
-  assert(strategy < MachineTraceStrategy::TS_NumStrategies &&
+MachineTraceMetrics::getEnsemble(MachineTraceStrategy Strategy) {
+  assert(Strategy < MachineTraceStrategy::TS_NumStrategies &&
          "Invalid trace strategy enum");
-  Ensemble *&E = Ensembles[static_cast<size_t>(strategy)];
+  Ensemble *&E = Ensembles[static_cast<size_t>(Strategy)];
   if (E)
     return E;
 
   // Allocate new Ensemble on demand.
-  switch (strategy) {
+  switch (Strategy) {
   case MachineTraceStrategy::TS_MinInstrCount:
     return (E = new MinInstrCountEnsemble(this));
   case MachineTraceStrategy::TS_Local:
@@ -448,8 +441,9 @@ struct LoopBounds {
   const MachineLoopInfo *Loops;
   bool Downward = false;
 
-  LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
-             const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
+  LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks,
+             const MachineLoopInfo *Loops)
+      : Blocks(Blocks), Loops(Loops) {}
 };
 
 } // end anonymous namespace
@@ -463,7 +457,7 @@ class po_iterator_storage<LoopBounds, true> {
   LoopBounds &LB;
 
 public:
-  po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+  po_iterator_storage(LoopBounds &LB) : LB(LB) {}
 
   void finishPostorder(const MachineBasicBlock*) {}
 
@@ -546,7 +540,7 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
   if (BadTBI.hasValidHeight()) {
     BadTBI.invalidateHeight();
     WorkList.push_back(BadMBB);
-    do {
+    while (!WorkList.empty()) {
       const MachineBasicBlock *MBB = WorkList.pop_back_val();
       LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
                         << getName() << " height.\n");
@@ -564,14 +558,14 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
         // Verify that TBI.Succ is actually a *I successor.
         assert((!TBI.Succ || Pred->isSuccessor(TBI.Succ)) && "CFG changed");
       }
-    } while (!WorkList.empty());
+    }
   }
 
   // Invalidate depth resources of blocks below MBB.
   if (BadTBI.hasValidDepth()) {
     BadTBI.invalidateDepth();
     WorkList.push_back(BadMBB);
-    do {
+    while (!WorkList.empty()) {
       const MachineBasicBlock *MBB = WorkList.pop_back_val();
       LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
                         << getName() << " depth.\n");
@@ -589,7 +583,7 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
         // Verify that TBI.Pred is actually a *I predecessor.
         assert((!TBI.Pred || Succ->isPredecessor(TBI.Pred)) && "CFG changed");
       }
-    } while (!WorkList.empty());
+    }
   }
 
   // Clear any per-instruction data. We only have to do this for BadMBB itself
@@ -605,7 +599,7 @@ void MachineTraceMetrics::Ensemble::verify() const {
 #ifndef NDEBUG
   assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
          "Outdated BlockInfo size");
-  for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
+  for (unsigned Num = 0; Num < BlockInfo.size(); ++Num) {
     const TraceBlockInfo &TBI = BlockInfo[Num];
     if (TBI.hasValidDepth() && TBI.Pred) {
       const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
@@ -686,7 +680,7 @@ static bool getDataDeps(const MachineInstr &UseMI,
     }
     // Collect virtual register reads.
     if (MO.readsReg())
-      Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo()));
+      Deps.emplace_back(MRI, Reg, MO.getOperandNo());
   }
   return HasPhysRegs;
 }
@@ -702,10 +696,10 @@ static void getPHIDeps(const MachineInstr &UseMI,
   if (!Pred)
     return;
   assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
-  for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
-    if (UseMI.getOperand(i + 1).getMBB() == Pred) {
-      Register Reg = UseMI.getOperand(i).getReg();
-      Deps.push_back(DataDep(MRI, Reg, i));
+  for (unsigned Idx = 1; Idx < UseMI.getNumOperands(); Idx += 2) {
+    if (UseMI.getOperand(Idx + 1).getMBB() == Pred) {
+      Register Reg = UseMI.getOperand(Idx).getReg();
+      Deps.emplace_back(MRI, Reg, Idx);
       return;
     }
   }
@@ -739,7 +733,7 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
       SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit);
       if (I == RegUnits.end())
         continue;
-      Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo()));
+      Deps.emplace_back(I->MI, I->Op, MO.getOperandNo());
       break;
     }
   }
@@ -852,14 +846,14 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
   // implies Head->HasValidInstrDepths, so we only need to start from the first
   // block in the trace that needs to be recomputed.
   SmallVector<const MachineBasicBlock*, 8> Stack;
-  do {
+  while (MBB) {
     TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
     assert(TBI.hasValidDepth() && "Incomplete trace");
     if (TBI.HasValidInstrDepths)
       break;
     Stack.push_back(MBB);
     MBB = TBI.Pred;
-  } while (MBB);
+  }
 
   // FIXME: If MBB is non-null at this point, it is the last pre-computed block
   // in the trace. We should track any live-out physregs that were defined in
@@ -880,7 +874,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
     LLVM_DEBUG({
       dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
       ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
-      for (unsigned K = 0; K != PRDepths.size(); ++K)
+      for (unsigned K = 0; K < PRDepths.size(); ++K)
         if (PRDepths[K]) {
           unsigned Factor = MTM.SchedModel.getResourceFactor(K);
           dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))
@@ -969,10 +963,8 @@ static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
                                                   Dep.UseOp);
 
   // Update Heights[DefMI] to be the maximum height seen.
-  MIHeightMap::iterator I;
-  bool New;
-  std::tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
-  if (New)
+  const auto &[I, Inserted] = Heights.insert({Dep.DefMI, UseHeight});
+  if (Inserted)
     return true;
 
   // DefMI has been pushed before. Give it the max height.
@@ -1010,7 +1002,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
   // The bottom of the trace may already be computed.
   // Find the blocks that need updating.
   SmallVector<const MachineBasicBlock*, 8> Stack;
-  do {
+  while (MBB) {
     TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
     assert(TBI.hasValidHeight() && "Incomplete trace");
     if (TBI.HasValidInstrHeights)
@@ -1018,7 +1010,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
     Stack.push_back(MBB);
     TBI.LiveIns.clear();
     MBB = TBI.Succ;
-  } while (MBB);
+  }
 
   // As we move upwards in the trace, keep track of instructions that are
   // required by deeper trace instructions. Map MI -> height required so far.
@@ -1060,7 +1052,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
     LLVM_DEBUG({
       dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
       ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
-      for (unsigned K = 0; K != PRHeights.size(); ++K)
+      for (unsigned K = 0; K < PRHeights.size(); ++K)
         if (PRHeights[K]) {
           unsigned Factor = MTM.SchedModel.getResourceFactor(K);
           dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))
@@ -1145,7 +1137,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
 
     // Transfer the live regunits to the live-in list.
     for (const LiveRegUnit &RU : RegUnits) {
-      TBI.LiveIns.push_back(LiveInReg(RU.RegUnit, RU.Cycle));
+      TBI.LiveIns.emplace_back(RU.RegUnit, RU.Cycle);
       LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RU.RegUnit, MTM.TRI) << '@'
                         << RU.Cycle);
     }
@@ -1205,7 +1197,7 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
   ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
   if (Bottom) {
     ArrayRef<unsigned> PRCycles = TE.MTM.getProcReleaseAtCycles(getBlockNum());
-    for (unsigned K = 0; K != PRDepths.size(); ++K)
+    for (unsigned K = 0; K < PRDepths.size(); ++K)
       PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);
   } else {
     for (unsigned PRD : PRDepths)
@@ -1235,9 +1227,8 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
   unsigned PRMax = 0;
 
   // Capture computing cycles from extra instructions
-  auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
-                            unsigned ResourceIdx)
-                         ->unsigned {
+  auto ExtraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
+                            unsigned ResourceIdx) -> unsigned {
     unsigned Cycles = 0;
     for (const MCSchedClassDesc *SC : Instrs) {
       if (!SC->isValid())
@@ -1255,12 +1246,12 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
     return Cycles;
   };
 
-  for (unsigned K = 0; K != PRDepths.size(); ++K) {
+  for (unsigned K = 0; K < PRDepths.size(); ++K) {
     unsigned PRCycles = PRDepths[K] + PRHeights[K];
     for (const MachineBasicBlock *MBB : Extrablocks)
       PRCycles += TE.MTM.getProcReleaseAtCycles(MBB->getNumber())[K];
-    PRCycles += extraCycles(ExtraInstrs, K);
-    PRCycles -= extraCycles(RemoveInstrs, K);
+    PRCycles += ExtraCycles(ExtraInstrs, K);
+    PRCycles -= ExtraCycles(RemoveInstrs, K);
     PRMax = std::max(PRMax, PRCycles);
   }
   // Convert to cycle count.
@@ -1292,9 +1283,9 @@ bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr &DefMI,
 
 void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
   OS << getName() << " ensemble:\n";
-  for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
-    OS << "  %bb." << i << '\t';
-    BlockInfo[i].print(OS);
+  for (unsigned Idx = 0; Idx < BlockInfo.size(); ++Idx) {
+    OS << "  %bb." << Idx << '\t';
+    BlockInfo[Idx].print(OS);
     OS << '\n';
   }
 }

>From 3db78fa1dd639ba5db761023e04f97418cfcd00e Mon Sep 17 00:00:00 2001
From: gonzalobg <65027571+gonzalobg at users.noreply.github.com>
Date: Wed, 24 Jul 2024 20:33:43 +0200
Subject: [PATCH 058/259] [NVPTX] Add Volta Load/Store Atomics (.relaxed,
 .acquire, .release) and Volatile (.mmio/.volatile) support  (#99709)

Followup to https://github.com/llvm/llvm-project/pull/98022 which broke
support for LLVM `unordered` atomic ordering.

This PR lowers `atomic unordered` to PTX volatile operations to preserve
atomicity, and adds a bunch of tests for this ordering to the NVPTX
backend (we had none).
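
To summarize the lowering table in getCodeMemorySemantic() with a small sketch (mine, not from the patch; whether std::atomic is directly usable in device code depends on the toolchain, so read the comments as describing the IR-level orderings the backend receives): LLVM's `unordered` ordering, which C++ cannot express directly, is lowered exactly like `monotonic`/relaxed, i.e. to .relaxed.sys on sm_70+ with PTX memory ordering and to .volatile on older targets, which is what preserves atomicity.

  #include <atomic>

  // Compiled for NVPTX, the accesses below map to the table in this patch:
  void producer(std::atomic<int> *flag, volatile int *mmio, int *plain) {
    *plain = 1;                                     // plain store: "weak" (sm_70+) / plain (sm_60-)
    *mmio = 2;                                      // volatile, non-atomic: st.volatile
    flag->store(1, std::memory_order_relaxed);      // relaxed atomic: st.relaxed.sys (sm_70+) / st.volatile (sm_60-)
    int v = flag->load(std::memory_order_acquire);  // acquire load: ld.acquire.sys (sm_70+), fatal error on sm_60-
    (void)v;
  }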
---
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp   |   28 +-
 llvm/lib/Target/NVPTX/NVPTX.h                 |    8 +
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   |  318 +++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  144 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |    7 +-
 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll   | 1279 +++++++++++++++++
 llvm/test/CodeGen/NVPTX/load-store.ll         |  881 +++++++++++-
 7 files changed, 2508 insertions(+), 157 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll

diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 380d878c1f532..a004d64c21cc6 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -227,9 +227,33 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
   if (Modifier) {
     const MCOperand &MO = MI->getOperand(OpNum);
     int Imm = (int) MO.getImm();
-    if (!strcmp(Modifier, "volatile")) {
-      if (Imm)
+    if (!strcmp(Modifier, "sem")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::NotAtomic:
+        break;
+      case NVPTX::PTXLdStInstCode::Volatile:
         O << ".volatile";
+        break;
+      case NVPTX::PTXLdStInstCode::Relaxed:
+        O << ".relaxed.sys";
+        break;
+      case NVPTX::PTXLdStInstCode::Acquire:
+        O << ".acquire.sys";
+        break;
+      case NVPTX::PTXLdStInstCode::Release:
+        O << ".release.sys";
+        break;
+      case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+        O << ".mmio.relaxed.sys";
+        break;
+      default:
+        SmallString<256> Msg;
+        raw_svector_ostream OS(Msg);
+        OS << "NVPTX LdStCode Printer does not support \"" << Imm
+           << "\" sem modifier.";
+        report_fatal_error(OS.str());
+        break;
+      }
     } else if (!strcmp(Modifier, "addsp")) {
       switch (Imm) {
       case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b0cb24c63c3ce..3c7167b157025 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -107,6 +107,14 @@ enum LoadStore {
 };
 
 namespace PTXLdStInstCode {
+enum MemorySemantic {
+  NotAtomic = 0, // PTX calls these: "Weak"
+  Volatile = 1,
+  Relaxed = 2,
+  Acquire = 3,
+  Release = 4,
+  RelaxedMMIO = 5
+};
 enum AddressSpace {
   GENERIC = 0,
   GLOBAL = 1,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 8516bc1bef83e..96456ad0547ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,6 +714,174 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
   return NVPTX::PTXLdStInstCode::GENERIC;
 }
 
+static unsigned int getCodeMemorySemantic(MemSDNode *N,
+                                          const NVPTXSubtarget *Subtarget) {
+  AtomicOrdering Ordering = N->getSuccessOrdering();
+  auto CodeAddrSpace = getCodeAddrSpace(N);
+
+  bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
+  bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
+
+  // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
+  // TODO: lowering for AcquireRelease Operations: for now, we error.
+  //
+
+  // clang-format off
+
+  // Lowering for non-SequentiallyConsistent Operations
+  //
+  // | Atomic  | Volatile | Statespace         | PTX sm_60- | PTX sm_70+                   |
+  // |---------|----------|--------------------|------------|------------------------------|
+  // | No      | No       | All                | plain      | .weak                        |
+  // | No      | Yes      | Generic,Shared,    | .volatile  | .volatile                    |
+  // |         |          | Global [0]         |            |                              |
+  // | No      | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
+  // | Unorder | Yes/No   | All                | == Relaxed | == Relaxed                   |
+  // | Relaxed | No       | Generic,Shared,    | .volatile  | <atomic sem>                 |
+  // |         |          | Global [0]         |            |                              |
+  // | Other   | No       | Generic,Shared,    | Error [2]  | <atomic sem>                 |
+  // |         |          | Global [0]         |            |                              |
+  // | Yes     | No       | Local,Const,Param  | plain [1]  | .weak [1]                    |
+  // | Relaxed | Yes      | Generic,Shared [0] | .volatile  | .volatile                    |
+  // | Relaxed | Yes      | Global [0]         | .volatile  | .mmio.relaxed.sys (PTX 8.2+) |
+  // |         |          |                    |            |  or .volatile (PTX 8.1-)     |
+  // | Relaxed | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
+  // | Other   | Yes      | Generic, Shared,   | Error [2]  | <atomic sem> [3]             |
+  // |         |          | / Global [0]       |            |                              |
+
+  // clang-format on
+
+  // [0]: volatile and atomics are only supported on global or shared
+  //      memory locations, accessed via generic/shared/global pointers.
+  //      MMIO is only supported on global memory locations,
+  //      accessed via generic/global pointers.
+  // TODO: Implement MMIO access via generic pointer to global.
+  //       Currently implemented for global pointers only.
+
+  // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
+  //      PTX instructions fails to preserve their C++ side-effects.
+  //
+  //      Example (https://github.com/llvm/llvm-project/issues/62057):
+  //
+  //          void example() {
+  //              std::atomic<bool> True = true;
+  //              while (True.load(std::memory_order_relaxed));
+  //          }
+  //
+  //      A C++ program that calls "example" is well-defined: the infinite loop
+  //      performs an atomic operation. By lowering volatile/atomics to
+  //      "weak" memory operations, we are transforming the above into:
+  //
+  //          void undefined_behavior() {
+  //              bool True = true;
+  //              while (True);
+  //          }
+  //
+  //      which exhibits undefined behavior in both C++ and PTX.
+  //
+  //      Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
+  //      behavior due to lack of Independent Forward Progress. Lowering these
+  //      to weak memory operations in sm_60- is therefore fine.
+  //
+  //      TODO: lower atomic and volatile operations to memory locations
+  //      in local, const, and param to two PTX instructions in sm_70+:
+  //        - the "weak" memory instruction we are currently lowering to, and
+  //        - some other instruction that preserves the side-effect, e.g.,
+  //          a dead dummy volatile load.
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+      CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+    return NVPTX::PTXLdStInstCode::NotAtomic;
+  }
+
+  // [2]: Atomics with Ordering different than Unordered or Relaxed are not
+  //      supported on sm_60 and older; this includes volatile atomics.
+  if (!(Ordering == AtomicOrdering::NotAtomic ||
+        Ordering == AtomicOrdering::Unordered ||
+        Ordering == AtomicOrdering::Monotonic) &&
+      !HasMemoryOrdering) {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX does not support \"atomic\" for orderings different than"
+          "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+       << toIRString(Ordering) << "\".";
+    report_fatal_error(OS.str());
+  }
+
+  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
+  // the volatile semantics and preserve the atomic ones.
+
+  // PTX volatile and PTX atomics are not available for statespace that differ
+  // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
+  // atomics is undefined if the generic address does not refer to a .global or
+  // .shared memory location.
+  bool AddrGenericOrGlobalOrShared =
+      (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
+       CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+  bool UseRelaxedMMIO =
+      HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+
+  switch (Ordering) {
+  case AtomicOrdering::NotAtomic:
+    return N->isVolatile() && AddrGenericOrGlobalOrShared
+               ? NVPTX::PTXLdStInstCode::Volatile
+               : NVPTX::PTXLdStInstCode::NotAtomic;
+  case AtomicOrdering::Unordered:
+    // We lower unordered in the exact same way as 'monotonic' to respect
+    // LLVM IR atomicity requirements.
+  case AtomicOrdering::Monotonic:
+    if (N->isVolatile())
+      return UseRelaxedMMIO                ? NVPTX::PTXLdStInstCode::RelaxedMMIO
+             : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+                                           : NVPTX::PTXLdStInstCode::NotAtomic;
+    else
+      return HasMemoryOrdering             ? NVPTX::PTXLdStInstCode::Relaxed
+             : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+                                           : NVPTX::PTXLdStInstCode::NotAtomic;
+  case AtomicOrdering::Acquire:
+    if (!N->readMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Acquire Ordering on reads: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
+                                       : NVPTX::PTXLdStInstCode::NotAtomic;
+  case AtomicOrdering::Release:
+    if (!N->writeMem()) {
+      SmallString<256> Msg;
+      raw_svector_ostream OS(Msg);
+      OS << "PTX only supports Release Ordering on writes: "
+         << N->getOperationName();
+      N->print(OS);
+      report_fatal_error(OS.str());
+    }
+    return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
+                                       : NVPTX::PTXLdStInstCode::NotAtomic;
+  case AtomicOrdering::AcquireRelease: {
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
+       << N->getOperationName();
+    N->print(OS);
+    report_fatal_error(OS.str());
+  }
+  case AtomicOrdering::SequentiallyConsistent:
+    // TODO: support AcquireRelease and SequentiallyConsistent
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "NVPTX backend does not support AtomicOrdering \""
+       << toIRString(Ordering) << "\" yet.";
+    report_fatal_error(OS.str());
+  }
+
+  llvm_unreachable("unexpected unhandled case");
+}
+
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                           unsigned CodeAddrSpace, MachineFunction *F) {
   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -916,32 +1084,18 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!LoadedVT.isSimple())
     return false;
 
-  AtomicOrdering Ordering = LD->getSuccessOrdering();
-  // In order to lower atomic loads with stronger guarantees we would need to
-  // use load.acquire or insert fences. However these features were only added
-  // with PTX ISA 6.0 / sm_70.
-  // TODO: Check if we can actually use the new instructions and implement them.
-  if (isStrongerThanMonotonic(Ordering))
-    return false;
-
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
     return tryLDGLDU(N);
   }
 
+  // Memory Semantic Setting
+  unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
+
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  // - .volatile has the same memory synchronization semantics as .relaxed.sys
-  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    isVolatile = false;
-
   // Type Setting: fromType + fromTypeWidth
   //
   // Sign   : ISD::SEXTLOAD
@@ -982,9 +1136,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Addr,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -993,9 +1151,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                              NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                                : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -1010,9 +1173,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     Base,
+                     Offset,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   } else {
     if (PointerSize == 64)
@@ -1026,9 +1194,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
                                NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
-                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
-                      getI32Imm(fromTypeWidth, dl), N1, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+                     getI32Imm(CodeAddrSpace, dl),
+                     getI32Imm(vecType, dl),
+                     getI32Imm(fromType, dl),
+                     getI32Imm(fromTypeWidth, dl),
+                     N1,
+                     Chain};
     NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
   }
 
@@ -1065,13 +1237,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  bool IsVolatile = MemSD->isVolatile();
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    IsVolatile = false;
+  // Memory Semantic Setting
+  unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
 
   // Vector Setting
   MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1138,9 +1305,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Addr,
+                     Chain};
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   } else if (PointerSize == 64
                  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1163,9 +1334,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Base,
+                     Offset,
+                     Chain};
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   } else if (PointerSize == 64
                  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
@@ -1208,9 +1384,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Base,
+                     Offset,
+                     Chain};
 
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   } else {
@@ -1253,9 +1434,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     }
     if (!Opcode)
       return false;
-    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
-                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
-                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
+    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+                     getI32Imm(CodeAddrSpace, DL),
+                     getI32Imm(VecType, DL),
+                     getI32Imm(FromType, DL),
+                     getI32Imm(FromTypeWidth, DL),
+                     Op1,
+                     Chain};
     LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
   }
 
@@ -1698,27 +1883,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (!StoreVT.isSimple())
     return false;
 
-  AtomicOrdering Ordering = ST->getSuccessOrdering();
-  // In order to lower atomic loads with stronger guarantees we would need to
-  // use store.release or insert fences. However these features were only added
-  // with PTX ISA 6.0 / sm_70.
-  // TODO: Check if we can actually use the new instructions and implement them.
-  if (isStrongerThanMonotonic(Ordering))
-    return false;
-
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  // - .volatile has the same memory synchronization semantics as .relaxed.sys
-  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    isVolatile = false;
+  // Memory Semantic Setting
+  unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
 
   // Vector Setting
   MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1755,7 +1926,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
     if (!Opcode)
       return false;
     SDValue Ops[] = {Value,
-                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeMemorySem, dl),
                      getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl),
                      getI32Imm(toType, dl),
@@ -1772,7 +1943,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
     if (!Opcode)
       return false;
     SDValue Ops[] = {Value,
-                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeMemorySem, dl),
                      getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl),
                      getI32Imm(toType, dl),
@@ -1797,7 +1968,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
       return false;
 
     SDValue Ops[] = {Value,
-                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeMemorySem, dl),
                      getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl),
                      getI32Imm(toType, dl),
@@ -1819,7 +1990,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
     if (!Opcode)
       return false;
     SDValue Ops[] = {Value,
-                     getI32Imm(isVolatile, dl),
+                     getI32Imm(CodeMemorySem, dl),
                      getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl),
                      getI32Imm(toType, dl),
@@ -1858,13 +2029,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   unsigned int PointerSize =
       CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
 
-  // Volatile Setting
-  // - .volatile is only available for .global and .shared
-  bool IsVolatile = MemSD->isVolatile();
-  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
-      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
-    IsVolatile = false;
+  // Memory Semantic Setting
+  unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
 
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
@@ -1906,7 +2072,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     ToTypeWidth = 32;
   }
 
-  StOps.push_back(getI32Imm(IsVolatile, DL));
+  StOps.push_back(getI32Imm(CodeMemorySem, DL));
   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
   StOps.push_back(getI32Imm(VecType, DL));
   StOps.push_back(getI32Imm(ToType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f82013b63dc6a..91b239a52d17f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
 multiclass LD<NVPTXRegClass regclass> {
   def _avar : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr];", []>;
   def _areg : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr];", []>;
   def _areg_64 : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr];", []>;
   def _ari : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr+$offset];", []>;
   def _ari_64 : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr+$offset];", []>;
   def _asi : NVPTXInst<
     (outs regclass:$dst),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t$dst, [$addr+$offset];", []>;
 }
 
@@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in {
 multiclass ST<NVPTXRegClass regclass> {
   def _avar : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr], $src;", []>;
   def _areg : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr], $src;", []>;
   def _areg_64 : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr], $src;", []>;
   def _ari : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr+$offset], $src;", []>;
   def _ari_64 : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr+$offset], $src;", []>;
   def _asi : NVPTXInst<
     (outs),
-    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+    (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
          LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
     " \t[$addr+$offset], $src;", []>;
 }
 
@@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in {
 multiclass LD_VEC<NVPTXRegClass regclass> {
   def _v2_avar : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr];", []>;
   def _v2_areg : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr];", []>;
   def _v2_areg_64 : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr];", []>;
   def _v2_ari : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   def _v2_ari_64 : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   def _v2_asi : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   def _v4_avar : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   def _v4_areg : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   def _v4_areg_64 : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   def _v4_ari : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
   def _v4_ari_64 : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
   def _v4_asi : NVPTXInst<
     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
-    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+    (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
 }
 let mayLoad=1, hasSideEffects=0 in {
@@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in {
 multiclass ST_VEC<NVPTXRegClass regclass> {
   def _v2_avar : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2}};", []>;
   def _v2_areg : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2}};", []>;
   def _v2_areg_64 : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2}};", []>;
   def _v2_ari : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
          i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   def _v2_ari_64 : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
          i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   def _v2_asi : NVPTXInst<
     (outs),
-    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
          i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   def _v4_avar : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   def _v4_areg : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   def _v4_areg_64 : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   def _v4_ari : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
   def _v4_ari_64 : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
   def _v4_asi : NVPTXInst<
     (outs),
     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
-         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
-    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+    "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
     "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 3ca4c1a24c79a..8df41913ff12e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -78,13 +78,18 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasAtomBitwise64() const { return SmVersion >= 32; }
   bool hasAtomMinMax64() const { return SmVersion >= 32; }
   bool hasLDG() const { return SmVersion >= 32; }
-  inline bool hasHWROT32() const { return SmVersion >= 32; }
+  bool hasHWROT32() const { return SmVersion >= 32; }
   bool hasImageHandles() const;
   bool hasFP16Math() const { return SmVersion >= 53; }
   bool hasBF16Math() const { return SmVersion >= 80; }
   bool allowFP16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }
   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
+  // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
+  // release, acq_rel, sc)?
+  bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
+  // Does SM & PTX support atomic relaxed MMIO operations?
+  bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix include architecture-accelerated features that
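
As a quick, illustrative sketch (not part of the patch) of what the new test file below checks on sm_70 with PTX 6.0+: monotonic and unordered atomics on the generic statespace lower to the .relaxed.sys forms, while acquire/release atomics lower to .acquire.sys/.release.sys. The function and value names here are made up; the expected PTX is taken from the CHECK patterns that follow.

; illustrative only -- not part of the patch
define i32 @ordering_sketch(ptr %p, ptr %q) {
  ; expected: ld.relaxed.sys.u32
  %x = load atomic i32, ptr %p monotonic, align 4
  ; expected: ld.acquire.sys.u32
  %y = load atomic i32, ptr %q acquire, align 4
  %s = add i32 %x, %y
  ; expected: st.release.sys.u32
  store atomic i32 %s, ptr %q release, align 4
  ret i32 %s
}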
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
new file mode 100644
index 0000000000000..68915b0f2698b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -0,0 +1,1279 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
+
+; CHECK-LABEL: generic_plain
+define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+  ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr %a
+
+  ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr %b
+
+  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr %c
+
+  ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr %d
+
+  ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr %c
+
+  ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr %c
+
+  ret void
+}
+
+; CHECK-LABEL: generic_volatile
+define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr %a
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr %b
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr %c
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr %d
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr %c
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr %c
+
+  ret void
+}
+
+; CHECK-LABEL: generic_unordered
+define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_monotonic
+define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a monotonic, align 1
+
+  ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b monotonic, align 2
+
+  ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d monotonic, align 8
+
+  ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_acq_rel
+define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a release, align 1
+
+  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b release, align 2
+
+  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c release, align 4
+
+  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d release, align 8
+
+  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e release, align 4
+
+  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e release, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile
+define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile
+define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e monotonic, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e monotonic, align 8
+
+  ret void
+}
+
+;; global statespace
+
+; CHECK-LABEL: global_plain
+define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+  ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(1) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(1) %a
+
+  ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(1) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(1) %b
+
+  ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(1) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(1) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(1) %d
+
+  ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(1) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(1) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(1) %c
+
+  ret void
+}
+
+; CHECK-LABEL: global_volatile
+define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(1) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(1) %a
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(1) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(1) %b
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(1) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(1) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(1) %d
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(1) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(1) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(1) %c
+
+  ret void
+}
+
+; CHECK-LABEL: global_unordered
+define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_monotonic
+define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+  ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+  ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+  ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile
+define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile
+define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_acq_rel
+define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+
+  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+
+  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+
+  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
+
+  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e release, align 4
+
+  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e release, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile
+define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+
+  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+
+  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+
+  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+
+  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+
+  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+
+  ret void
+}
+
+;; shared statespace
+
+; CHECK-LABEL: shared_plain
+define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+  ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(3) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(3) %a
+
+  ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(3) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(3) %b
+
+  ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(3) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(3) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(3) %d
+
+  ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(3) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(3) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(3) %c
+
+  ret void
+}
+
+; CHECK-LABEL: shared_volatile
+define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(3) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(3) %a
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(3) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(3) %b
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(3) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(3) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(3) %d
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(3) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(3) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(3) %c
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered
+define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile
+define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_monotonic
+define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+  ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+  ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+  ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+  ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile
+define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_acq_rel
+define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+
+  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+
+  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+
+  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+
+  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e release, align 4
+
+  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e release, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile
+define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+
+  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+
+  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+
+  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+
+  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+
+  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+
+  ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_plain
+define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(5) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(5) %a
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(5) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(5) %b
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(5) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(5) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(5) %d
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(5) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(5) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(5) %c
+
+  ret void
+}
+
+; CHECK-LABEL: local_volatile
+define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(5) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(5) %a
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(5) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(5) %b
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(5) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(5) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(5) %d
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(5) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(5) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(5) %c
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered
+define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile
+define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_monotonic
+define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_acq_rel
+define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e release, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e release, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile
+define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index c477bd9e744cd..4c5e0920ce1ae 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,8 +1,10 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK-LABEL: plain
-define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+;; generic statespace
+
+; CHECK-LABEL: generic_plain
+define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
   ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load i8, ptr %a
   %a.add = add i8 %a.load, 1
@@ -42,8 +44,8 @@ define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
   ret void
 }
 
-; CHECK-LABEL: volatile
-define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: generic_volatile
+define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
   ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load volatile i8, ptr %a
   %a.add = add i8 %a.load, 1
@@ -83,8 +85,8 @@ define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
   ret void
 }
 
-; CHECK-LABEL: monotonic
-define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic
+define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -123,3 +125,870 @@ define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_add
 
   ret void
 }
+
+; CHECK-LABEL: generic_monotonic_volatile
+define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e monotonic, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_unordered
+define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile
+define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr %a unordered, align 1
+
+  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr %b unordered, align 2
+
+  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr %c unordered, align 4
+
+  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr %d unordered, align 8
+
+  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr %e unordered, align 4
+
+  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr %e unordered, align 8
+
+  ret void
+}
+
+;; global statespace
+
+; CHECK-LABEL: global_plain
+define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+  ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(1) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(1) %a
+
+  ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(1) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(1) %b
+
+  ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(1) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(1) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(1) %d
+
+  ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(1) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(1) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(1) %c
+
+  ret void
+}
+
+; CHECK-LABEL: global_volatile
+define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(1) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(1) %a
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(1) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(1) %b
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(1) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(1) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(1) %d
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(1) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(1) %c
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(1) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(1) %c
+
+  ret void
+}
+
+; CHECK-LABEL: global_monotonic
+define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile
+define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_unordered
+define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile
+define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+
+  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+
+  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+
+  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+
+  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+
+  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+
+  ret void
+}
+
+;; shared statespace
+
+; CHECK-LABEL: shared_plain
+define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+  ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(3) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(3) %a
+
+  ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(3) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(3) %b
+
+  ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(3) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(3) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(3) %d
+
+  ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(3) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(3) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(3) %c
+
+  ret void
+}
+
+; CHECK-LABEL: shared_volatile
+define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(3) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(3) %a
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(3) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(3) %b
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(3) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(3) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(3) %d
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(3) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(3) %c
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(3) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(3) %c
+
+  ret void
+}
+
+; CHECK-LABEL: shared_monotonic
+define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile
+define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered
+define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile
+define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+
+  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+
+  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+
+  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+
+  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+
+  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+
+  ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_plain
+define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load i8, ptr addrspace(5) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i8 %a.add, ptr addrspace(5) %a
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load i16, ptr addrspace(5) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store i16 %b.add, ptr addrspace(5) %b
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load i32, ptr addrspace(5) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store i32 %c.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load i64, ptr addrspace(5) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store i64 %d.add, ptr addrspace(5) %d
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load float, ptr addrspace(5) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store float %e.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load double, ptr addrspace(5) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store double %f.add, ptr addrspace(5) %c
+
+  ret void
+}
+
+; CHECK-LABEL: local_volatile
+define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load volatile i8, ptr addrspace(5) %a
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i8 %a.add, ptr addrspace(5) %a
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load volatile i16, ptr addrspace(5) %b
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store volatile i16 %b.add, ptr addrspace(5) %b
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load volatile i32, ptr addrspace(5) %c
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store volatile i32 %c.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load volatile i64, ptr addrspace(5) %d
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store volatile i64 %d.add, ptr addrspace(5) %d
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load volatile float, ptr addrspace(5) %c
+  %e.add = fadd float %e.load, 1.
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store volatile float %e.add, ptr addrspace(5) %c
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load volatile double, ptr addrspace(5) %c
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store volatile double %f.add, ptr addrspace(5) %c
+
+  ret void
+}
+
+; CHECK-LABEL: local_monotonic
+define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered
+define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile
+define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+  %a.add = add i8 %a.load, 1
+  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+
+  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+  %b.add = add i16 %b.load, 1
+  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+
+  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+  %c.add = add i32 %c.load, 1
+  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+
+  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+  %d.add = add i64 %d.load, 1
+  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+
+  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
+  %e.add = fadd float %e.load, 1.0
+  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+
+  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+  %f.add = fadd double %f.load, 1.
+  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+
+  ret void
+}

From 0dbd72d6abfd390fc62eab43e195c21feb5e9bd0 Mon Sep 17 00:00:00 2001
From: Brendan Dahl <brendan.dahl at gmail.com>
Date: Wed, 24 Jul 2024 11:55:36 -0700
Subject: [PATCH 059/259] [WebAssembly] Implement f16x8.replace_lane
 instruction. (#99388)

Use a builtin and intrinsic until half types are better supported for
instruction selection.
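
For illustration only (not part of the change; the function name and lane
index below are placeholders), the new intrinsic takes the source vector, a
lane index, and an f32 value, and returns the updated vector. A minimal
standalone IR use, mirroring the test added below, might look like:

  declare <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half>, i32, float)

  define <8 x half> @set_lane3(<8 x half> %v, float %x) {
    ; replace lane 3 of %v with the f32 value %x
    %r = call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %v, i32 3, float %x)
    ret <8 x half> %r
  }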
---
 clang/include/clang/Basic/BuiltinsWebAssembly.def   |  1 +
 clang/lib/CodeGen/CGBuiltin.cpp                     |  7 +++++++
 clang/test/CodeGen/builtins-wasm.c                  |  6 ++++++
 llvm/include/llvm/IR/IntrinsicsWebAssembly.td       |  4 ++++
 llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 13 +++++++++++++
 llvm/test/CodeGen/WebAssembly/half-precision.ll     |  8 ++++++++
 llvm/test/MC/WebAssembly/simd-encodings.s           |  3 +++
 7 files changed, 42 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 2a45f8a6582a2..df304a71e475e 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -201,6 +201,7 @@ TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "half-precision")
 TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hif", "nc", "half-precision")
 
 // Reference Types builtins
 // Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a0d03b87ccdc9..f1dee801e4fe7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21567,6 +21567,13 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_extract_lane_f16x8);
     return Builder.CreateCall(Callee, {Vector, Index});
   }
+  case WebAssembly::BI__builtin_wasm_replace_lane_f16x8: {
+    Value *Vector = EmitScalarExpr(E->getArg(0));
+    Value *Index = EmitScalarExpr(E->getArg(1));
+    Value *Val = EmitScalarExpr(E->getArg(2));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_replace_lane_f16x8);
+    return Builder.CreateCall(Callee, {Vector, Index, Val});
+  }
   case WebAssembly::BI__builtin_wasm_table_get: {
     assert(E->getArg(0)->getType()->isArrayType());
     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 75861b1b4bd6d..f494aeada0157 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -840,6 +840,12 @@ float extract_lane_f16x8(f16x8 a, int i) {
   return __builtin_wasm_extract_lane_f16x8(a, i);
 }
 
+f16x8 replace_lane_f16x8(f16x8 a, int i, float v) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %a, i32 %i, float %v)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_replace_lane_f16x8(a, i, v);
+}
+
 f16x8 min_f16x8(f16x8 a, f16x8 b) {
   // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> %a, <8 x half> %b)
   // WEBASSEMBLY-NEXT: ret <8 x half> %0
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 47aab196a6d4f..4d2df1c44ebce 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -363,6 +363,10 @@ def int_wasm_extract_lane_f16x8:
   DefaultAttrsIntrinsic<[llvm_float_ty],
                         [llvm_v8f16_ty, llvm_i32_ty],
                         [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_replace_lane_f16x8:
+  DefaultAttrsIntrinsic<[llvm_v8f16_ty],
+                        [llvm_v8f16_ty, llvm_i32_ty, llvm_float_ty],
+                        [IntrNoMem, IntrSpeculatable]>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index d6c6425b10041..26fe61b1d6051 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -702,6 +702,19 @@ defm "" : ReplaceLane<I64x2, 30>;
 defm "" : ReplaceLane<F32x4, 32>;
 defm "" : ReplaceLane<F64x2, 34>;
 
+// For now use an intrinsic for f16x8.replace_lane instead of ReplaceLane above
+// since LLVM IR generated with half type arguments is not well supported and
+// creates conversions from f16->f32.
+defm REPLACE_LANE_F16x8 :
+  HALF_PRECISION_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx, F32:$x),
+                   (outs), (ins vec_i8imm_op:$idx),
+                   [(set (v8f16 V128:$dst), (int_wasm_replace_lane_f16x8
+                     (v8f16 V128:$vec),
+                     (i32 LaneIdx8:$idx),
+                     (f32 F32:$x)))],
+                   "f16x8.replace_lane\t$dst, $vec, $idx, $x",
+                   "f16x8.replace_lane\t$idx", 0x122>;
+
 // Lower undef lane indices to zero
 def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
           (REPLACE_LANE_I8x16 $vec, 0, $x)>;
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index fa78f5f9591d6..dba4138ad59cc 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -36,6 +36,14 @@ define float @extract_lane_v8f16(<8 x half> %v) {
   ret float %r
 }
 
+; CHECK-LABEL: replace_lane_v8f16:
+; CHECK:       f16x8.replace_lane $push0=, $0, 1, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @replace_lane_v8f16(<8 x half> %v, float %f) {
+  %r = call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %v, i32 1, float %f)
+  ret <8 x half> %r
+}
+
 ; CHECK-LABEL: add_v8f16:
 ; CHECK:       f16x8.add $push0=, $0, $1
 ; CHECK-NEXT:  return $pop0
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 8c3483bfaad7a..7ae4d47d888cf 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -851,6 +851,9 @@ main:
     # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
     f16x8.extract_lane 1
 
+    # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01]
+    f16x8.replace_lane 1
+
     # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02]
     f16x8.add
 

From b00fddec2db19a372bb4cec5558163214e6f0043 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 24 Jul 2024 12:03:20 -0700
Subject: [PATCH 060/259] [RISCV] Add test cases for failures to form widening
 FMA instructions. NFC

If the fp_extend is in the scalar domain before the shuffle, we
won't recognize the widening opportunity.
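
For reference, the shape exercised by the new tests is a scalar fpext that is
splatted afterwards, so the extend sits behind the shuffle rather than on the
vector operand. A condensed sketch of that pattern (the function name is a
placeholder; the body follows the tests below):

  declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)

  define <2 x float> @missed_vfwmacc_sketch(<2 x float> %va, <2 x half> %vb, half %c) {
    ; %c is extended in the scalar domain and then splatted, so the combine
    ; does not see a vector-wide fpext feeding the multiply and no vfwmacc.vf
    ; is formed
    %cext  = fpext half %c to float
    %head  = insertelement <2 x float> poison, float %cext, i32 0
    %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
    %vd    = fpext <2 x half> %vb to <2 x float>
    %vf    = call <2 x float> @llvm.fma.v2f32(<2 x float> %vd, <2 x float> %splat, <2 x float> %va)
    ret <2 x float> %vf
  }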
---
 .../RISCV/rvv/fixed-vectors-vfwmacc.ll        | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index bcf5d7b3365c3..1803b52aca674 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -2027,3 +2027,75 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
   %vg = call <8 x double> @llvm.fma.v8f64(<8 x double> %vd, <8 x double> %vf, <8 x double> %va)
   ret <8 x double> %vg
 }
+
+define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwmacc_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fcvt.s.h fa5, fa0
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfmacc.vf v8, fa5, v10
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = call <2 x float> @llvm.fma.v2f32(<2 x float> %vd, <2 x float> %splat, <2 x float> %va)
+  ret <2 x float> %vf
+}
+
+define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwmsac_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fcvt.s.h fa5, fa0
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfmsac.vf v8, fa5, v10
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %ve = fneg <2 x float> %va
+  %vf = call <2 x float> @llvm.fma.v2f32(<2 x float> %vd, <2 x float> %splat, <2 x float> %ve)
+  ret <2 x float> %vf
+}
+
+define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwnmacc_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fcvt.s.h fa5, fa0
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfnmacc.vf v8, fa5, v10
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = fneg <2 x float> %va
+  %vg = fneg <2 x float> %vd
+  %vh = call <2 x float> @llvm.fma.v2f32(<2 x float> %vg, <2 x float> %splat, <2 x float> %vf)
+  ret <2 x float> %vh
+}
+
+define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
+; CHECK-LABEL: vfwnmsac_vf2_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fcvt.s.h fa5, fa0
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfnmsac.vf v8, fa5, v10
+; CHECK-NEXT:    ret
+  %cext = fpext half %c to float
+  %head = insertelement <2 x float> poison, float %cext, i32 0
+  %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer
+  %vd = fpext <2 x half> %vb to <2 x float>
+  %vf = fneg <2 x float> %vd
+  %vg = call <2 x float> @llvm.fma.v2f32(<2 x float> %vf, <2 x float> %splat, <2 x float> %va)
+  ret <2 x float> %vg
+}

From 59e07f34bdbd463254b0b2744bd0a587d2cd6438 Mon Sep 17 00:00:00 2001
From: Gang Chen <gangc at amd.com>
Date: Wed, 24 Jul 2024 12:04:25 -0700
Subject: [PATCH 061/259] [AMDGPU][GlobalISel] wrap the load-splitting code in
 RegBank selection with condition (#98966)

The load-splitting code in RegBank selection is only relevant to the
listed address spaces, because those are the address spaces for which
legalization cannot always determine how far a load needs to be split.

---------

Signed-off-by: gangc <gangc at amd.com>
---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 4a3f327e4c591..8da8c94b4d665 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1059,6 +1059,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
   Register DstReg = MI.getOperand(0).getReg();
   const LLT LoadTy = MRI.getType(DstReg);
   unsigned LoadSize = LoadTy.getSizeInBits();
+  MachineMemOperand *MMO = *MI.memoperands_begin();
   const unsigned MaxNonSmrdLoadSize = 128;
 
   const RegisterBank *DstBank =
@@ -1069,7 +1070,6 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
     if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
       return false;
 
-    MachineMemOperand *MMO = *MI.memoperands_begin();
     const unsigned MemSize = 8 * MMO->getSize().getValue();
     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
@@ -1141,25 +1141,29 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
   if (SrcRegs.empty())
     SrcRegs.push_back(MI.getOperand(1).getReg());
 
-  assert(LoadSize % MaxNonSmrdLoadSize == 0);
-
   // RegBankSelect only emits scalar types, so we need to reset the pointer
   // operand to a pointer type.
   Register BasePtrReg = SrcRegs[0];
   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
   MRI.setType(BasePtrReg, PtrTy);
 
-  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
-  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
-  ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
-  LegalizerHelper Helper(B.getMF(), O, B);
-
-  if (LoadTy.isVector()) {
-    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
-      return false;
-  } else {
-    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
-      return false;
+  // The following loads were not split far enough during legalization
+  // because it was not clear whether they would be SMEM or VMEM loads.
+  if (AMDGPU::isExtendedGlobalAddrSpace(MMO->getAddrSpace()) ||
+      MMO->getAddrSpace() == AMDGPUAS::BUFFER_RESOURCE) {
+    assert(LoadSize % MaxNonSmrdLoadSize == 0);
+    unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
+    const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
+    ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
+    LegalizerHelper Helper(B.getMF(), O, B);
+    if (LoadTy.isVector()) {
+      if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
+          LegalizerHelper::Legalized)
+        return false;
+    } else {
+      if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+        return false;
+    }
   }
 
   MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);

From 2e3ee31d2901bd1faeb57703992358130d5f2dcc Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 14:16:16 -0500
Subject: [PATCH 062/259] [libc] Enable 'sscanf' on the GPU #100211

Summary:
We can now enable the sscanf function on the GPU. This required adding
the scanf configuration options to the sscanf test so that the GPU
build does not attempt float conversions.
---
 libc/config/gpu/entrypoints.txt     | 1 +
 libc/docs/gpu/support.rst           | 9 +++++----
 libc/test/src/stdio/CMakeLists.txt  | 9 +++++++++
 libc/test/src/stdio/sscanf_test.cpp | 4 ++++
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index fa878d8999227..f7e89bdcd95cc 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -186,6 +186,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.sprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.vsprintf
+    libc.src.stdio.sscanf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst
index 71dd1b30c5e3a..5ef298a2ba58f 100644
--- a/libc/docs/gpu/support.rst
+++ b/libc/docs/gpu/support.rst
@@ -234,10 +234,11 @@ printf         |check|    |check|
 vprintf        |check|    |check|
 fprintf        |check|    |check|
 vfprintf       |check|    |check|
-sprintf        |check|    |check|
-snprintf       |check|    |check|
-vsprintf       |check|    |check|
-vsnprintf      |check|    |check|
+sprintf        |check|
+snprintf       |check|
+vsprintf       |check|
+vsnprintf      |check|
+sscanf         |check|
 putchar        |check|    |check|
 fclose         |check|    |check|
 fopen          |check|    |check|
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 5eb8c9577893b..10ec890b043a7 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -261,6 +261,13 @@ add_libc_test(
     ${use_system_file}
 )
 
+if(LIBC_CONF_SCANF_DISABLE_FLOAT)
+  list(APPEND sscanf_test_copts "-DLIBC_COPT_SCANF_DISABLE_FLOAT")
+endif()
+if(LIBC_CONF_SCANF_DISABLE_INDEX_MODE)
+  list(APPEND sscanf_test_copts "-DLIBC_COPT_SCANF_DISABLE_INDEX_MODE")
+endif()
+
 add_libc_test(
   sscanf_test
   SUITE
@@ -271,6 +278,8 @@ add_libc_test(
     libc.src.stdio.sscanf
   LINK_LIBRARIES
     LibcFPTestHelpers
+  COMPILE_OPTIONS
+    ${sscanf_test_copts}
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp
index 741815bb15171..59be4e6de6ed6 100644
--- a/libc/test/src/stdio/sscanf_test.cpp
+++ b/libc/test/src/stdio/sscanf_test.cpp
@@ -226,6 +226,7 @@ TEST(LlvmLibcSScanfTest, IntConvNoWriteTests) {
   EXPECT_EQ(result, 0);
 }
 
+#ifndef LIBC_COPT_SCANF_DISABLE_FLOAT
 TEST(LlvmLibcSScanfTest, FloatConvSimple) {
   int ret_val;
   float result = 0;
@@ -580,7 +581,9 @@ TEST(LlvmLibcSScanfTest, FloatConvNoWrite) {
   ret_val = LIBC_NAMESPACE::sscanf("Not a float", "%*f", &result);
   EXPECT_EQ(ret_val, 0);
 }
+#endif
 
+#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
 TEST(LlvmLibcSScanfTest, CurPosCombined) {
   int ret_val;
   int result = -1;
@@ -628,6 +631,7 @@ TEST(LlvmLibcSScanfTest, CurPosCombined) {
   EXPECT_EQ(ret_val, 1);
   EXPECT_EQ(result, 320);
 }
+#endif
 
 TEST(LlvmLibcSScanfTest, PointerConvCombined) {
   int ret_val;

From 8c7188aa4c1468971d18e9dff8f2b85aa7e9bfd9 Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Wed, 24 Jul 2024 12:27:55 -0700
Subject: [PATCH 063/259] Update NVVM ldu/ldg intrinsics with IntrWillReturn
 and test for DCE (#98968)

Dead calls to these intrinsics were not being deleted at the IR level
because they were not marked `IntrWillReturn`, though they were being
deleted when building the SDAG. This fixes that and adds a test to
confirm they are deleted during `opt`.
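
As a minimal sketch of what this enables (mirroring the new test rather than
replacing it; the function name is a placeholder), a call whose result is
unused can now be removed by -passes=dce at the IR level instead of only
while building the SDAG:

  declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1), i32)

  define void @dead_ldg_example(ptr addrspace(1) %p) {
    ; %v is never used; with IntrWillReturn the call is trivially dead
    %v = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %p, i32 4)
    ret void
  }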
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  12 +-
 .../Transforms/DCE/nvvm-ldu-ldg-willreturn.ll | 187 ++++++++++++++++++
 2 files changed, 193 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 854eb2f8dd6df..1e7fdb53059e2 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1529,30 +1529,30 @@ def int_nvvm_mbarrier_pending_count :
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>],
+  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.p">;
 
 // Use for generic pointers
diff --git a/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll b/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll
new file mode 100644
index 0000000000000..64a023ef45137
--- /dev/null
+++ b/llvm/test/Transforms/DCE/nvvm-ldu-ldg-willreturn.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S < %s -passes=dce  | FileCheck %s
+
+; ldu/ldg intrinsics were erroneously not marked IntrWillReturn, preventing
+; them from being eliminated at IR level when dead.
+
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 %align)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
+declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
+declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+
+define void @test_ldu_i8_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i8_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_i16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldu_i32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_i64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_i64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_p_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_p_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_f32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldu_f64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldu_f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldu_v2f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldu_v2f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i8_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i8_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldg_i32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_i64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_i64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_p_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_p_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_f32_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f32_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}
+
+define void @test_ldg_f64_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f64_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
+  ret void
+}
+
+define void @test_ldg_f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
+  ret void
+}
+
+define void @test_ldg_v2f16_dead(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @test_ldg_v2f16_dead(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
+  ret void
+}

From deb40a253a5448a7b1f8d9680a0ae512c2d17283 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 24 Jul 2024 12:23:39 -0700
Subject: [PATCH 064/259] [RISCV] Add missing CHECK prefix to
 fixed-vectors-vfw-web-simplification.ll. NFC

The FOLDING prefix was ambiguous on one of the test cases. It would be
nice if the update script reported this.
---
 .../rvv/fixed-vectors-vfw-web-simplification.ll     | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 26f77225dbb0e..3a99f535e9071 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
 ; Check that the default value enables the web folding and
 ; that it is bigger than 3.
@@ -23,6 +23,17 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
 ; NO_FOLDING-NEXT:    vse32.v v8, (a2)
 ; NO_FOLDING-NEXT:    ret
 ;
+; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT:    vfwmul.vv v11, v8, v9
+; ZVFH-NEXT:    vfwadd.vv v12, v8, v10
+; ZVFH-NEXT:    vfwsub.vv v8, v9, v10
+; ZVFH-NEXT:    vse32.v v11, (a0)
+; ZVFH-NEXT:    vse32.v v12, (a1)
+; ZVFH-NEXT:    vse32.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
 ; ZVFHMIN-LABEL: vfwmul_v2f116_multiple_users:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma

From 0ee32c45730c94be1b7d5fa60a0e8dff5751d014 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Wed, 24 Jul 2024 14:34:31 -0500
Subject: [PATCH 065/259] [AMDGPU] Implement llvm.lrint intrinsic lowering
 (#98931)

This patch enables the target-independent lowering of llvm.lrint via
GlobalISel.
For SelectionDAG, the intrinsic is custom lowered for AMDGPU.
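
Conceptually (a sketch of the expansion, not code from the patch; the
function name is a placeholder), both paths rewrite the call as a rint
followed by a signed conversion, i.e. G_FRINT + G_FPTOSI in GlobalISel and
FRINT + FP_TO_SINT in SelectionDAG, roughly:

  declare i32 @llvm.lrint.i32.f32(float)
  declare float @llvm.rint.f32(float)

  define i32 @lrint_expansion_sketch(float %x) {
    ; %r = call i32 @llvm.lrint.i32.f32(float %x) is expanded into:
    %rounded = call float @llvm.rint.f32(float %x)
    %r = fptosi float %rounded to i32
    ret i32 %r
  }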
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  13 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  10 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  18 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   3 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   5 +
 llvm/test/CodeGen/AMDGPU/lrint.ll             | 771 ++++++++++++++++++
 6 files changed, 811 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lrint.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6c7885c491f41..b490ab2fc7617 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3900,6 +3900,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
     return Legalized;
   }
+  case TargetOpcode::G_INTRINSIC_LRINT:
+  case TargetOpcode::G_INTRINSIC_LLRINT: {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    auto Round =
+        MIRBuilder.buildInstr(TargetOpcode::G_FRINT, {SrcTy}, {SrcReg});
+    MIRBuilder.buildFPTOSI(DstReg, Round);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
     Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes);
@@ -4755,6 +4766,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FCEIL:
   case G_FFLOOR:
   case G_FRINT:
+  case G_INTRINSIC_LRINT:
+  case G_INTRINSIC_LLRINT:
   case G_INTRINSIC_ROUND:
   case G_INTRINSIC_ROUNDEVEN:
   case G_INTRINSIC_TRUNC:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 3a39f6a4d2b4a..bdb7917073020 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4336,6 +4336,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     // targets where it is not needed.
     Results.push_back(Node->getOperand(0));
     break;
+  case ISD::LRINT:
+  case ISD::LLRINT: {
+    SDValue Arg = Node->getOperand(0);
+    EVT ArgVT = Arg.getValueType();
+    EVT ResVT = Node->getValueType(0);
+    SDLoc dl(Node);
+    SDValue RoundNode = DAG.getNode(ISD::FRINT, dl, ArgVT, Arg);
+    Results.push_back(DAG.getNode(ISD::FP_TO_SINT, dl, ResVT, RoundNode));
+    break;
+  }
   case ISD::GLOBAL_OFFSET_TABLE:
   case ISD::GlobalAddress:
   case ISD::GlobalTLSAddress:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bc5fc9659469d..6ca9955993d24 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -824,15 +824,15 @@ void TargetLoweringBase::initActions() {
                      Expand);
 
   // These library functions default to expand.
-  setOperationAction(
-      {ISD::FCBRT, ISD::FLOG,   ISD::FLOG2,  ISD::FLOG10,     ISD::FEXP,
-       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
-       ISD::FRINT, ISD::FTRUNC, ISD::LRINT,  ISD::LLRINT,     ISD::FROUNDEVEN,
-       ISD::FTAN,  ISD::FACOS,  ISD::FASIN,  ISD::FATAN,      ISD::FCOSH,
-       ISD::FSINH, ISD::FTANH},
-      {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
-  setOperationAction({ISD::LROUND, ISD::LLROUND},
+  setOperationAction({ISD::FCBRT,      ISD::FLOG,  ISD::FLOG2,  ISD::FLOG10,
+                      ISD::FEXP,       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
+                      ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT,  ISD::FTRUNC,
+                      ISD::FROUNDEVEN, ISD::FTAN,  ISD::FACOS,  ISD::FASIN,
+                      ISD::FATAN,      ISD::FCOSH, ISD::FSINH,  ISD::FTANH},
+                     {MVT::f32, MVT::f64, MVT::f128}, Expand);
+
+  // FIXME: Query RuntimeLibCalls to make the decision.
+  setOperationAction({ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
                      {MVT::f32, MVT::f64, MVT::f128}, LibCall);
 
   setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index af0ffc8fc7d07..ec386000302f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -408,6 +408,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
+  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
+                     Expand);
+
   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
   if (Subtarget->has16BitInsts())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bdec2c54b4450..c6c4b8f930647 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1146,6 +1146,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
 
+  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
+      .clampScalar(0, S16, S64)
+      .scalarize(0)
+      .lower();
+
   if (ST.has16BitInsts()) {
     getActionDefinitionsBuilder(
         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
diff --git a/llvm/test/CodeGen/AMDGPU/lrint.ll b/llvm/test/CodeGen/AMDGPU/lrint.ll
new file mode 100644
index 0000000000000..31e6cf6ea645c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lrint.ll
@@ -0,0 +1,771 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+
+declare float @llvm.rint.f32(float)
+declare i32 @llvm.lrint.i32.f32(float)
+declare i32 @llvm.lrint.i32.f64(double)
+declare i64 @llvm.lrint.i64.f32(float)
+declare i64 @llvm.lrint.i64.f64(double)
+declare i64 @llvm.llrint.i64.f32(float)
+declare half @llvm.rint.f16(half)
+declare i32 @llvm.lrint.i32.f16(half %arg)
+declare <2 x float> @llvm.rint.v2f32.v2f32(<2 x float> %arg)
+declare <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float> %arg)
+declare <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %arg)
+
+define float @intrinsic_frint(float %arg) {
+; GCN-LABEL: intrinsic_frint:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call float @llvm.rint.f32(float %arg)
+  ret float %res
+}
+
+define i32 @intrinsic_lrint_i32_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f32(float %arg)
+  ret i32 %res
+}
+
+define i32 @intrinsic_lrint_i32_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f64(double %arg)
+  ret i32 %res
+}
+
+define i64 @intrinsic_lrint_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lrint.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_lrint_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lrint.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llrint_i64_f32(float %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llrint_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llrint_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llrint.i64.f32(float %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llrint_i64_f64(double %arg) {
+; GFX9-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llrint_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llrint_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.llrint.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define half @intrinsic_frint_half(half %arg) {
+; GCN-LABEL: intrinsic_frint_half:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call half @llvm.rint.f16(half %arg)
+  ret half %res
+}
+
+define i32 @intrinsic_lrint_i32_f16(half %arg) {
+; GFX9-LABEL: intrinsic_lrint_i32_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_i32_f16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_i32_f16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lrint.i32.f16(half %arg)
+  ret i32 %res
+}
+
+define <2 x float> @intrinsic_frint_v2f32_v2f32(<2 x float> %arg) {
+; GCN-LABEL: intrinsic_frint_v2f32_v2f32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_rndne_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x float> @llvm.rint.v2f32.v2f32(<2 x float> %arg)
+  ret <2 x float> %res
+}
+
+define <2 x i32> @intrinsic_lrint_v2i32_v2f32(<2 x float> %arg) {
+; GFX9-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lrint_v2i32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x i32> @llvm.lrint.v2i32.v2f32(<2 x float> %arg)
+  ret <2 x i32> %res
+}
+
+define <2 x i64> @intrinsic_lrint_v2i64_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v2, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v2, s5, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v4
+; GFX9-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v2|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s5, |v2|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v2, v6, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v4, v5, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v4, |v2|, v3
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v2, v4, v5, |v2|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v6
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v4, v6
+; GFX9-GISEL-NEXT:    v_rndne_f32_e32 v4, v1
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v4
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v3, |v1|, v3
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v3, v5, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v6, vcc
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, v5, v4
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v3, v3
+; GFX10-SDAG-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v5, v5
+; GFX10-GISEL-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX10-GISEL-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v3, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v4, v4
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX11-GISEL-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %arg)
+  ret <2 x i64> %res
+}
+

>From 20eff684203287828d6722fc860b9d3621429542 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma at quicinc.com>
Date: Wed, 24 Jul 2024 12:36:08 -0700
Subject: [PATCH 066/259] [ExprConstant] Handle shift overflow the same way as
 other kinds of overflow (#99579)

We have a mechanism to allow folding expressions that aren't ICEs as an
extension; use it more consistently.

This ends up degrading diagnostics in a few cases, but that's not specific
to shifts; it's a general issue with how those uses handle overflow
diagnostics.
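
For context, a minimal C++ sketch of the kind of code this affects, mirroring
the updated tests below; the quoted diagnostics are taken from those tests and
assume C++11 or later:

  // Illustrative only; not part of the patch.
  enum { overflow_shift = 1 << 32 };
  // error: expression is not an integral constant expression
  // note: shift count 32 >= width of type 'int' (32 bits)

  struct S {
    static const int overflow_shift = 1 << 32;
    // error: in-class initializer for static data member is not a
    // constant expression
  };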
---
 clang/docs/ReleaseNotes.rst              |  2 ++
 clang/lib/AST/ExprConstant.cpp           | 26 ++++++++++++++++--------
 clang/lib/AST/Interp/Interp.h            | 22 ++++++++++++--------
 clang/lib/Sema/SemaExpr.cpp              |  3 ++-
 clang/test/CXX/basic/basic.types/p10.cpp |  2 +-
 clang/test/Sema/constant-builtins-2.c    | 12 ++++-------
 clang/test/SemaCXX/class.cpp             |  9 +++++++-
 clang/test/SemaCXX/enum.cpp              | 24 +++++++++++++++-------
 8 files changed, 65 insertions(+), 35 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 65de90f69e198..7a09822a1dd52 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -126,6 +126,8 @@ Improvements to Clang's diagnostics
 
 - Clang now diagnoses dangling references to fields of temporary objects. Fixes #GH81589.
 
+- Clang now diagnoses undefined behavior in constant expressions more consistently. This includes invalid shifts, and signed overflow in arithmetic.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index c5c86dee04c5b..558e20ed3e423 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2843,6 +2843,8 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
       // During constant-folding, a negative shift is an opposite shift. Such
       // a shift is not a constant expression.
       Info.CCEDiag(E, diag::note_constexpr_negative_shift) << RHS;
+      if (!Info.noteUndefinedBehavior())
+        return false;
       RHS = -RHS;
       goto shift_right;
     }
@@ -2853,19 +2855,23 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
     if (SA != RHS) {
       Info.CCEDiag(E, diag::note_constexpr_large_shift)
         << RHS << E->getType() << LHS.getBitWidth();
+      if (!Info.noteUndefinedBehavior())
+        return false;
     } else if (LHS.isSigned() && !Info.getLangOpts().CPlusPlus20) {
       // C++11 [expr.shift]p2: A signed left shift must have a non-negative
       // operand, and must not overflow the corresponding unsigned type.
       // C++2a [expr.shift]p2: E1 << E2 is the unique value congruent to
       // E1 x 2^E2 module 2^N.
-      if (LHS.isNegative())
+      if (LHS.isNegative()) {
         Info.CCEDiag(E, diag::note_constexpr_lshift_of_negative) << LHS;
-      else if (LHS.countl_zero() < SA)
+        if (!Info.noteUndefinedBehavior())
+          return false;
+      } else if (LHS.countl_zero() < SA) {
         Info.CCEDiag(E, diag::note_constexpr_lshift_discards);
+        if (!Info.noteUndefinedBehavior())
+          return false;
+      }
     }
-    if (Info.EvalStatus.Diag && !Info.EvalStatus.Diag->empty() &&
-        Info.getLangOpts().CPlusPlus11)
-      return false;
     Result = LHS << SA;
     return true;
   }
@@ -2879,6 +2885,8 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
       // During constant-folding, a negative shift is an opposite shift. Such a
       // shift is not a constant expression.
       Info.CCEDiag(E, diag::note_constexpr_negative_shift) << RHS;
+      if (!Info.noteUndefinedBehavior())
+        return false;
       RHS = -RHS;
       goto shift_left;
     }
@@ -2886,13 +2894,13 @@ static bool handleIntIntBinOp(EvalInfo &Info, const BinaryOperator *E,
     // C++11 [expr.shift]p1: Shift width must be less than the bit width of the
     // shifted type.
     unsigned SA = (unsigned) RHS.getLimitedValue(LHS.getBitWidth()-1);
-    if (SA != RHS)
+    if (SA != RHS) {
       Info.CCEDiag(E, diag::note_constexpr_large_shift)
         << RHS << E->getType() << LHS.getBitWidth();
+      if (!Info.noteUndefinedBehavior())
+        return false;
+    }
 
-    if (Info.EvalStatus.Diag && !Info.EvalStatus.Diag->empty() &&
-        Info.getLangOpts().CPlusPlus11)
-      return false;
     Result = LHS >> SA;
     return true;
   }
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 492802897f013..96c59dd14c04a 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -153,7 +153,8 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
   if (RHS.isNegative()) {
     const SourceInfo &Loc = S.Current->getSource(OpPC);
     S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    return false;
+    if (!S.noteUndefinedBehavior())
+      return false;
   }
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
@@ -163,17 +164,24 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
     S.CCEDiag(E, diag::note_constexpr_large_shift) << Val << Ty << Bits;
-    return !(S.getEvalStatus().Diag && !S.getEvalStatus().Diag->empty() && S.getLangOpts().CPlusPlus11);
+    if (!S.noteUndefinedBehavior())
+      return false;
   }
 
   if (LHS.isSigned() && !S.getLangOpts().CPlusPlus20) {
     const Expr *E = S.Current->getExpr(OpPC);
     // C++11 [expr.shift]p2: A signed left shift must have a non-negative
     // operand, and must not overflow the corresponding unsigned type.
-    if (LHS.isNegative())
+    if (LHS.isNegative()) {
       S.CCEDiag(E, diag::note_constexpr_lshift_of_negative) << LHS.toAPSInt();
-    else if (LHS.toUnsigned().countLeadingZeros() < static_cast<unsigned>(RHS))
+      if (!S.noteUndefinedBehavior())
+        return false;
+    } else if (LHS.toUnsigned().countLeadingZeros() <
+               static_cast<unsigned>(RHS)) {
       S.CCEDiag(E, diag::note_constexpr_lshift_discards);
+      if (!S.noteUndefinedBehavior())
+        return false;
+    }
   }
 
   // C++2a [expr.shift]p2: [P0907R4]:
@@ -2269,8 +2277,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     // shift is not a constant expression.
     const SourceInfo &Loc = S.Current->getSource(OpPC);
     S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag &&
-        !S.getEvalStatus().Diag->empty())
+    if (!S.noteUndefinedBehavior())
       return false;
     RHS = -RHS;
     return DoShift < LT, RT,
@@ -2286,8 +2293,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // E1 x 2^E2 module 2^N.
       const SourceInfo &Loc = S.Current->getSource(OpPC);
       S.CCEDiag(Loc, diag::note_constexpr_lshift_of_negative) << LHS.toAPSInt();
-      if (S.getLangOpts().CPlusPlus11 && S.getEvalStatus().Diag &&
-          !S.getEvalStatus().Diag->empty())
+      if (!S.noteUndefinedBehavior())
         return false;
     }
   }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 206194930f3b4..74c0e01705905 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -17045,7 +17045,8 @@ Sema::VerifyIntegerConstantExpression(Expr *E, llvm::APSInt *Result,
   // not a constant expression as a side-effect.
   bool Folded =
       E->EvaluateAsRValue(EvalResult, Context, /*isConstantContext*/ true) &&
-      EvalResult.Val.isInt() && !EvalResult.HasSideEffects;
+      EvalResult.Val.isInt() && !EvalResult.HasSideEffects &&
+      (!getLangOpts().CPlusPlus || !EvalResult.HasUndefinedBehavior);
 
   if (!isa<ConstantExpr>(E))
     E = ConstantExpr::Create(Context, E, EvalResult.Val);
diff --git a/clang/test/CXX/basic/basic.types/p10.cpp b/clang/test/CXX/basic/basic.types/p10.cpp
index a543f248e5371..92d6da0035ea5 100644
--- a/clang/test/CXX/basic/basic.types/p10.cpp
+++ b/clang/test/CXX/basic/basic.types/p10.cpp
@@ -142,7 +142,7 @@ constexpr int arb(int n) { // expected-note {{declared here}}
                expected-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
 }
 constexpr long Overflow[(1 << 30) << 2]{}; // expected-warning {{requires 34 bits to represent}} \
-                                              expected-warning {{variable length array folded to constant array as an extension}} \
+                                              expected-error {{variable length array declaration not allowed at file scope}} \
                                               expected-warning {{variable length arrays in C++ are a Clang extension}} \
                                               expected-note {{signed left shift discards bits}}
 
diff --git a/clang/test/Sema/constant-builtins-2.c b/clang/test/Sema/constant-builtins-2.c
index 00767267cd6c2..37b63cf4f6b32 100644
--- a/clang/test/Sema/constant-builtins-2.c
+++ b/clang/test/Sema/constant-builtins-2.c
@@ -265,10 +265,8 @@ char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 :
 char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 char clz54[__builtin_clzg((unsigned __int128)0xf) == BITSIZE(__int128) - 4 ? 1 : -1];
 char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
-char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                             // expected-note at -1 {{shift count 127 >= width of type 'int' (32 bits)}}
-char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                 // expected-note at -1 {{shift count 127 >= width of type 'int' (32 bits)}}
+char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
+char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 #endif
 int clz58 = __builtin_clzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
 char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
@@ -276,10 +274,8 @@ char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) -
 char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
 char clz62[__builtin_clzg((unsigned _BitInt(128))0xf) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
 char clz63[__builtin_clzg((unsigned _BitInt(128))0xf, 42) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
-char clz64[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1))) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                     // expected-note at -1 {{shift count 127 >= width of type 'int' (32 bits)}}
-char clz65[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1)), 42) == 0 ? 1 : -1]; // expected-warning {{variable length array folded to constant array as an extension}}
-                                                                                                         // expected-note at -1 {{shift count 127 >= width of type 'int' (32 bits)}}
+char clz64[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
+char clz65[__builtin_clzg((unsigned _BitInt(128))(1 << (BITSIZE(_BitInt(128)) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
 
 char ctz1[__builtin_ctz(1) == 0 ? 1 : -1];
 char ctz2[__builtin_ctz(8) == 3 ? 1 : -1];
diff --git a/clang/test/SemaCXX/class.cpp b/clang/test/SemaCXX/class.cpp
index f874b7be2b70e..2f59544e7f36c 100644
--- a/clang/test/SemaCXX/class.cpp
+++ b/clang/test/SemaCXX/class.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s 
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -Wc++11-compat %s
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s -std=c++98
 class C {
 public:
@@ -55,6 +55,13 @@ class C {
   // expected-error at -2 {{static const volatile data member must be initialized out of line}}
 #endif
   static const E evi = 0;
+  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+                                               // expected-warning at -1 {{overflow in expression}}
+  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
 
   void m() {
     sx = 0;
diff --git a/clang/test/SemaCXX/enum.cpp b/clang/test/SemaCXX/enum.cpp
index 739d35ec4a06b..9c398cc8da886 100644
--- a/clang/test/SemaCXX/enum.cpp
+++ b/clang/test/SemaCXX/enum.cpp
@@ -103,14 +103,14 @@ void PR8089() {
 // This is accepted as a GNU extension. In C++98, there was no provision for
 // expressions with UB to be non-constant.
 enum { overflow = 123456 * 234567 };
-#if __cplusplus >= 201103L
-// expected-warning at -2 {{expression is not an integral constant expression; folding it to a constant is a GNU extension}}
-// expected-note at -3 {{value 28958703552 is outside the range of representable values of type 'int'}}
-#else
-// expected-error at -5 {{expression is not an integral constant expression}}
-// expected-note at -6 {{value 28958703552 is outside the range of representable values of type 'int'}}
-// expected-warning at -7 {{overflow in expression; result is -1'106'067'520 with type 'int'}}
+// expected-error at -1 {{expression is not an integral constant expression}}
+// expected-note at -2 {{value 28958703552 is outside the range of representable values of type 'int'}}
+#if __cplusplus < 201103L
+// expected-warning at -4 {{overflow in expression; result is -1'106'067'520 with type 'int'}}
 #endif
+enum { overflow_shift = 1 << 32 };
+// expected-error at -1 {{expression is not an integral constant expression}}
+// expected-note at -2 {{shift count 32 >= width of type 'int' (32 bits)}}
 
 // FIXME: This is not consistent with the above case.
 enum NoFold : int { overflow2 = 123456 * 234567 };
@@ -123,6 +123,16 @@ enum NoFold : int { overflow2 = 123456 * 234567 };
 // expected-error at -7 {{expression is not an integral constant expression}}
 // expected-note at -8 {{value 28958703552 is outside the range of representable values of type 'int'}}
 #endif
+enum : int { overflow2_shift = 1 << 32 };
+#if __cplusplus >= 201103L
+// expected-error at -2 {{enumerator value is not a constant expression}}
+// expected-note at -3 {{shift count 32 >= width of type 'int' (32 bits)}}
+#else
+// expected-error at -5 {{expression is not an integral constant expression}}
+// expected-note at -6 {{shift count 32 >= width of type 'int' (32 bits)}}
+// expected-warning at -7 {{enumeration types with a fixed underlying type are a C++11 extension}}
+#endif
+
 
 // PR28903
 struct PR28903 {

>From b1f263e4c2466a693609a3930f53b9887be67b5b Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 24 Jul 2024 13:03:46 -0700
Subject: [PATCH 067/259] [llvm][MachineLICM] Fix a comment typo. NFC

---
 llvm/lib/CodeGen/MachineLICM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index f24ab187ef400..d75df2adc440d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -476,7 +476,7 @@ static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
   RUs |= RUsFromRegsNotInMask;
 }
 
-/// Examine the instruction for potentai LICM candidate. Also
+/// Examine the instruction for potential LICM candidate. Also
 /// gather register def and frame object update information.
 void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
                                 BitVector &RUClobbers,

>From bfcfb0fd2dea5e06f74e49ff8b4e1dc10c9acf6a Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Wed, 24 Jul 2024 22:13:52 +0200
Subject: [PATCH 068/259] [GlobalIsel] Modernize truncate of ext. (#100338)

Credits:
https://github.com/llvm/llvm-project/pull/90964
https://reviews.llvm.org/D87050

combine-trunc.mir

Functional changes intended.
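
As a rough sketch of the replacement logic (assuming the usual G_TRUNC and
G_{S,Z,ANY}EXT semantics), the fold picks its result purely from the relative
scalar sizes of the extension source and the truncation destination; the
helper below is illustrative only, not the CombinerHelper API:

  enum class Fold { Copy, Extend, Truncate };

  // trunc(ext x : SrcTy -> MidTy) : MidTy -> DstTy
  Fold foldTruncOfExt(unsigned SrcBits, unsigned DstBits) {
    if (SrcBits == DstBits)
      return Fold::Copy;     // trunc(ext x) == x, so just copy x
    if (SrcBits < DstBits)
      return Fold::Extend;   // re-extend x with the original ext kind (if legal)
    return Fold::Truncate;   // destination is narrower than x, truncate directly
  }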
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  10 +-
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |  15 ++
 .../include/llvm/Target/GlobalISel/Combine.td |  32 ++--
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  34 ----
 .../GlobalISel/CombinerHelperCasts.cpp        |  48 +++++
 .../AArch64/GlobalISel/arm64-atomic.ll        |  48 ++---
 .../AArch64/GlobalISel/arm64-pcsections.ll    |  40 ++--
 .../AArch64/GlobalISel/combine-trunc.mir      | 179 ++++++++++++++----
 8 files changed, 274 insertions(+), 132 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 37a56e12efcc3..47365c3be3b93 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -390,12 +390,6 @@ class CombinerHelper {
   void applyCombineExtOfExt(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
 
-  /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
-  bool matchCombineTruncOfExt(MachineInstr &MI,
-                              std::pair<Register, unsigned> &MatchInfo);
-  void applyCombineTruncOfExt(MachineInstr &MI,
-                              std::pair<Register, unsigned> &MatchInfo);
-
   /// Transform trunc (shl x, K) to shl (trunc x), K
   ///    if K < VT.getScalarSizeInBits().
   ///
@@ -886,6 +880,10 @@ class CombinerHelper {
 
   bool matchShlOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo);
 
+  /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
+  bool matchTruncateOfExt(const MachineInstr &Root, const MachineInstr &ExtMI,
+                          BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 80897953156a0..8b7e8c0fbf1f5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -919,6 +919,21 @@ class GSUCmp : public GenericMachineInstr {
   };
 };
 
+/// Represents an integer-like extending operation.
+class GExtOp : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_SEXT:
+    case TargetOpcode::G_ZEXT:
+    case TargetOpcode::G_ANYEXT:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3ef0636ebf1c7..2362e77b54be2 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -839,15 +839,6 @@ def unmerge_zext_to_zext : GICombineRule<
   (apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
 >;
 
-// Fold trunc ([asz]ext x) -> x or ([asz]ext x) or (trunc x).
-def trunc_ext_fold_matchinfo : GIDefMatchData<"std::pair<Register, unsigned>">;
-def trunc_ext_fold: GICombineRule <
-  (defs root:$root, trunc_ext_fold_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_TRUNC):$root,
-         [{ return Helper.matchCombineTruncOfExt(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyCombineTruncOfExt(*${root}, ${matchinfo}); }])
->;
-
 // Under certain conditions, transform:
 //  trunc (shl x, K)     -> shl (trunc x), K//
 //  trunc ([al]shr x, K) -> (trunc ([al]shr (trunc x), K))
@@ -1768,6 +1759,25 @@ def freeze_combines: GICombineGroup<[
   push_freeze_to_prevent_poison_from_propagating
 ]>;
 
+/// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
+class truncate_of_opcode<Instruction extOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (extOpcode $ext, $src):$ExtMI,
+         (G_TRUNC $root, $ext):$root,
+         [{ return Helper.matchTruncateOfExt(*${root}, *${ExtMI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+def truncate_of_zext : truncate_of_opcode<G_ZEXT>;
+def truncate_of_sext : truncate_of_opcode<G_SEXT>;
+def truncate_of_anyext : truncate_of_opcode<G_ANYEXT>;
+
+def cast_combines: GICombineGroup<[
+  truncate_of_zext,
+  truncate_of_sext,
+  truncate_of_anyext
+]>;
+
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1828,7 +1838,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 def prefer_sign_combines : GICombineGroup<[nneg_zext]>;
 
 def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
-    vector_ops_combines, freeze_combines,
+    vector_ops_combines, freeze_combines, cast_combines,
     insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
     combine_extracted_vector_load,
     undef_combines, identity_combines, phi_combines,
@@ -1839,7 +1849,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     known_bits_simplifications, ext_ext_fold,
     not_cmp_fold, opt_brcond_by_inverting_cond,
     unmerge_merge, unmerge_cst, unmerge_dead_to_trunc,
-    unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
+    unmerge_zext_to_zext, merge_unmerge, trunc_shift,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
     div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 208f554eb8f98..8c05931812af5 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2582,40 +2582,6 @@ void CombinerHelper::applyCombineExtOfExt(
   }
 }
 
-bool CombinerHelper::matchCombineTruncOfExt(
-    MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
-  Register SrcReg = MI.getOperand(1).getReg();
-  MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
-  unsigned SrcOpc = SrcMI->getOpcode();
-  if (SrcOpc == TargetOpcode::G_ANYEXT || SrcOpc == TargetOpcode::G_SEXT ||
-      SrcOpc == TargetOpcode::G_ZEXT) {
-    MatchInfo = std::make_pair(SrcMI->getOperand(1).getReg(), SrcOpc);
-    return true;
-  }
-  return false;
-}
-
-void CombinerHelper::applyCombineTruncOfExt(
-    MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
-  Register SrcReg = MatchInfo.first;
-  unsigned SrcExtOp = MatchInfo.second;
-  Register DstReg = MI.getOperand(0).getReg();
-  LLT SrcTy = MRI.getType(SrcReg);
-  LLT DstTy = MRI.getType(DstReg);
-  if (SrcTy == DstTy) {
-    MI.eraseFromParent();
-    replaceRegWith(MRI, DstReg, SrcReg);
-    return;
-  }
-  if (SrcTy.getSizeInBits() < DstTy.getSizeInBits())
-    Builder.buildInstr(SrcExtOp, {DstReg}, {SrcReg});
-  else
-    Builder.buildTrunc(DstReg, SrcReg);
-  MI.eraseFromParent();
-}
-
 static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
   const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
   const unsigned TruncSize = TruncTy.getScalarSizeInBits();
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 8fe69f21fafd1..d36685bf28313 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -113,3 +113,51 @@ bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
 
   return false;
 }
+
+bool CombinerHelper::matchTruncateOfExt(const MachineInstr &Root,
+                                        const MachineInstr &ExtMI,
+                                        BuildFnTy &MatchInfo) {
+  const GTrunc *Trunc = cast<GTrunc>(&Root);
+  const GExtOp *Ext = cast<GExtOp>(&ExtMI);
+
+  if (!MRI.hasOneNonDBGUse(Ext->getReg(0)))
+    return false;
+
+  Register Dst = Trunc->getReg(0);
+  Register Src = Ext->getSrcReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (SrcTy == DstTy) {
+    // The source and the destination are equally sized. We need to copy.
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+
+    return true;
+  }
+
+  if (SrcTy.getScalarSizeInBits() < DstTy.getScalarSizeInBits()) {
+    // If the source is smaller than the destination, we need to extend.
+
+    if (!isLegalOrBeforeLegalizer({Ext->getOpcode(), {DstTy, SrcTy}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildInstr(Ext->getOpcode(), {Dst}, {Src});
+    };
+
+    return true;
+  }
+
+  if (SrcTy.getScalarSizeInBits() > DstTy.getScalarSizeInBits()) {
+    // If the source is larger than the destination, then we need to truncate.
+
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}}))
+      return false;
+
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildTrunc(Dst, Src); };
+
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index b619aac709d98..de3f323891a36 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -2659,9 +2659,9 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB35_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldaxrb w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-NOLSE-O1-NEXT:    stlxrb w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB35_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2674,9 +2674,9 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB35_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldaxrb w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-OUTLINE-O1-NEXT:    stlxrb w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB35_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2781,9 +2781,9 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB36_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrb w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-NOLSE-O1-NEXT:    stxrb w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB36_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2796,9 +2796,9 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB36_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldxrb w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-OUTLINE-O1-NEXT:    stxrb w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB36_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3714,9 +3714,9 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB45_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldaxrh w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-NOLSE-O1-NEXT:    stlxrh w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB45_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3729,9 +3729,9 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB45_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldaxrh w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, lo
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, lo
 ; CHECK-OUTLINE-O1-NEXT:    stlxrh w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB45_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3836,9 +3836,9 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-NEXT:  LBB46_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrh w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-NOLSE-O1-NEXT:    cmp w10, w9
-; CHECK-NOLSE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-NOLSE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-NOLSE-O1-NEXT:    cmp w8, w9
+; CHECK-NOLSE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-NOLSE-O1-NEXT:    stxrh w11, w10, [x0]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w11, LBB46_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3851,9 +3851,9 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-OUTLINE-O1-NEXT:  LBB46_1: ; %atomicrmw.start
 ; CHECK-OUTLINE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-OUTLINE-O1-NEXT:    ldxrh w8, [x0]
-; CHECK-OUTLINE-O1-NEXT:    and w10, w8, #0xffff
-; CHECK-OUTLINE-O1-NEXT:    cmp w10, w9
-; CHECK-OUTLINE-O1-NEXT:    csel w10, w10, w9, hi
+; CHECK-OUTLINE-O1-NEXT:    and w8, w8, #0xffff
+; CHECK-OUTLINE-O1-NEXT:    cmp w8, w9
+; CHECK-OUTLINE-O1-NEXT:    csel w10, w8, w9, hi
 ; CHECK-OUTLINE-O1-NEXT:    stxrh w11, w10, [x0]
 ; CHECK-OUTLINE-O1-NEXT:    cbnz w11, LBB46_1
 ; CHECK-OUTLINE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index c8d313cf31afd..c6819ff39ed33 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -926,16 +926,16 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 7
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umin ptr %ptr, i8 %rhs seq_cst, !pcsections !0
   ret i8 %res
@@ -954,16 +954,16 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 7
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umax ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
@@ -1179,16 +1179,16 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 15
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umin ptr %ptr, i16 %rhs seq_cst, !pcsections !0
   ret i16 %res
@@ -1207,16 +1207,16 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   liveins: $w9, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   renamable $w10 = ANDWri renamable $w8, 15
-  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
-  ; CHECK-NEXT:   renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8
+  ; CHECK-NEXT:   $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0
+  ; CHECK-NEXT:   renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
   ; CHECK-NEXT:   early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w11, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $w8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, killed $w8, 0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw umax ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
index 9f1d403340f49..4a38b5d4c63dd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-trunc.mir
@@ -1,36 +1,51 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRE
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-POST
+
 ---
 name:            test_combine_trunc_undef
+legalized: true
 body:             |
   bb.1:
     ; CHECK-LABEL: name: test_combine_trunc_undef
     ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK: $w0 = COPY [[DEF]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
     %0:_(s64) = G_IMPLICIT_DEF
     %1:_(s32) = G_TRUNC %0(s64)
     $w0 = COPY %1(s32)
 ...
 ---
 name:            test_combine_trunc_undef_vec
+legalized: true
 body:             |
   bb.1:
     ; CHECK-LABEL: name: test_combine_trunc_undef_vec
     ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
-    ; CHECK: $x0 = COPY [[DEF]](<2 x s32>)
+    ; CHECK-NEXT: $x0 = COPY [[DEF]](<2 x s32>)
     %0:_(<2 x s64>) = G_IMPLICIT_DEF
     %1:_(<2 x s32>) = G_TRUNC %0(<2 x s64>)
     $x0 = COPY %1(<2 x s32>)
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[ANYEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_ANYEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -38,13 +53,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s16_vec
+legalized: true
 body:             |
   bb.1:
   liveins: $s0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>)
-    ; CHECK: $x0 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
+    ; CHECK-PRE: liveins: $s0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>)
+    ; CHECK-PRE-NEXT: $x0 = COPY [[ANYEXT]](<2 x s32>)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s16_vec
+    ; CHECK-POST: liveins: $s0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $s0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s64>) = G_ANYEXT [[COPY]](<2 x s16>)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[ANYEXT]](<2 x s64>)
+    ; CHECK-POST-NEXT: $x0 = COPY [[TRUNC]](<2 x s32>)
     %0:_(<2 x s16>) = COPY $s0
     %1:_(<2 x s64>) = G_ANYEXT %0(<2 x s16>)
     %2:_(<2 x s32>) = G_TRUNC %1(<2 x s64>)
@@ -52,13 +78,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_sext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_sext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[SEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_sext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[SEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_sext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_SEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -66,13 +103,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_zext_s32_s16
+legalized: true
 body:             |
   bb.1:
   liveins: $h0
-    ; CHECK-LABEL: name: test_combine_trunc_zext_s32_s16
-    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
-    ; CHECK: $w0 = COPY [[ZEXT]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_zext_s32_s16
+    ; CHECK-PRE: liveins: $h0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[ZEXT]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_zext_s32_s16
+    ; CHECK-POST: liveins: $h0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-POST-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s16)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ZEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s16) = COPY $h0
     %1:_(s64) = G_ZEXT %0(s16)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -80,12 +128,23 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s32
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s32
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s32
+    ; CHECK-PRE: liveins: $w0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: $w0 = COPY [[COPY]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s32
+    ; CHECK-POST: liveins: $w0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s64)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s32) = COPY $w0
     %1:_(s64) = G_ANYEXT %0(s32)
     %2:_(s32) = G_TRUNC %1(s64)
@@ -93,13 +152,24 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_anyext_s32_s64
+legalized: true
 body:             |
   bb.1:
   liveins: $x0
-    ; CHECK-LABEL: name: test_combine_trunc_anyext_s32_s64
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_anyext_s32_s64
+    ; CHECK-PRE: liveins: $x0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-PRE-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_anyext_s32_s64
+    ; CHECK-POST: liveins: $x0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: [[ANYEXT:%[0-9]+]]:_(s128) = G_ANYEXT [[COPY]](s64)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]](s128)
+    ; CHECK-POST-NEXT: $w0 = COPY [[TRUNC]](s32)
     %0:_(s64) = COPY $x0
     %1:_(s128) = G_ANYEXT %0(s64)
     %2:_(s32) = G_TRUNC %1(s128)
@@ -107,15 +177,27 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_shl_s32_by_2
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
-    ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_2
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s32)
-    ; CHECK: $h0 = COPY [[SHL]](s16)
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_shl_s32_by_2
+    ; CHECK-PRE: liveins: $w0
+    ; CHECK-PRE-NEXT: {{  $}}
+    ; CHECK-PRE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-PRE-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s32)
+    ; CHECK-PRE-NEXT: $h0 = COPY [[SHL]](s16)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_shl_s32_by_2
+    ; CHECK-POST: liveins: $w0
+    ; CHECK-POST-NEXT: {{  $}}
+    ; CHECK-POST-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-POST-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-POST-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; CHECK-POST-NEXT: $h0 = COPY [[TRUNC]](s16)
     %0:_(s32) = COPY $w0
     %1:_(s32) = G_CONSTANT i32 2
     %2:_(s32) = G_SHL %0(s32), %1(s32)
@@ -124,18 +206,41 @@ body:             |
 ...
 ---
 name:            test_combine_trunc_shl_s32_by_17
+legalized: true
 body:             |
   bb.1:
   liveins: $w0
     ; CHECK-LABEL: name: test_combine_trunc_shl_s32_by_17
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
-    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
-    ; CHECK: $h0 = COPY [[TRUNC]](s16)
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
+    ; CHECK-NEXT: $h0 = COPY [[TRUNC]](s16)
     %0:_(s32) = COPY $w0
     %1:_(s32) = G_CONSTANT i32 17
     %2:_(s32) = G_SHL %0(s32), %1(s32)
     %3:_(s16) = G_TRUNC %2(s32)
     $h0 = COPY %3(s16)
 ...
+---
+name:            test_combine_trunc_multi_use
+legalized: true
+body:             |
+  bb.1:
+  liveins: $w0
+    ; CHECK-LABEL: name: test_combine_trunc_multi_use
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ZEXT]](s64)
+    ; CHECK-NEXT: $h0 = COPY [[TRUNC]](s16)
+    ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64)
+    %0:_(s32) = COPY $w0
+    %2:_(s64) = G_ZEXT %0(s32)
+    %3:_(s16) = G_TRUNC %2(s64)
+    $h0 = COPY %3(s16)
+    $x0 = COPY %2(s64)
+...

From 10ff2bcb5eaf169b0d9f6f12851ccae339a54aaf Mon Sep 17 00:00:00 2001
From: Narayan <32898329+vortex73 at users.noreply.github.com>
Date: Thu, 25 Jul 2024 02:12:22 +0530
Subject: [PATCH 069/259] [Clang] Refactor uses of  `Cand->Function` in
 SemaOverload.cpp (#98965)

- [ ] Adds checks to called functions that take `Cand->Function` as an
argument.
- [ ] Assigns `Cand->Function` to a `FunctionDecl *` local to enhance
readability.
Solves: #98769 and #98942
---
 clang/lib/Sema/SemaOverload.cpp | 36 +++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 554a2df14bea6..86074a4f3a585 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -47,6 +47,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <cstdlib>
 #include <optional>
@@ -9977,8 +9978,9 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name,
                                    CandEnd = CandidateSet.end();
        Cand != CandEnd; ++Cand)
     if (Cand->Function) {
-      Fns.erase(Cand->Function);
-      if (FunctionTemplateDecl *FunTmpl = Cand->Function->getPrimaryTemplate())
+      FunctionDecl *Fn = Cand->Function;
+      Fns.erase(Fn);
+      if (FunctionTemplateDecl *FunTmpl = Fn->getPrimaryTemplate())
         Fns.erase(FunTmpl);
     }
 
@@ -11332,8 +11334,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
     }
   }
 
-  if (TakingCandidateAddress &&
-      !checkAddressOfCandidateIsAvailable(S, Cand->Function))
+  if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn))
     return;
 
   // Emit the generic diagnostic and, optionally, add the hints to it.
@@ -11359,6 +11360,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
 /// over a candidate in any candidate set.
 static bool CheckArityMismatch(Sema &S, OverloadCandidate *Cand,
                                unsigned NumArgs, bool IsAddressOf = false) {
+  assert(Cand->Function && "Candidate is required to be a function.");
   FunctionDecl *Fn = Cand->Function;
   unsigned MinParams = Fn->getMinRequiredExplicitArguments() +
                        ((IsAddressOf && !Fn->isStatic()) ? 1 : 0);
@@ -11449,8 +11451,10 @@ static void DiagnoseArityMismatch(Sema &S, NamedDecl *Found, Decl *D,
 /// Arity mismatch diagnosis specific to a function overload candidate.
 static void DiagnoseArityMismatch(Sema &S, OverloadCandidate *Cand,
                                   unsigned NumFormalArgs) {
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
   if (!CheckArityMismatch(S, Cand, NumFormalArgs, Cand->TookAddressOfOverload))
-    DiagnoseArityMismatch(S, Cand->FoundDecl, Cand->Function, NumFormalArgs,
+    DiagnoseArityMismatch(S, Cand->FoundDecl, Fn, NumFormalArgs,
                           Cand->TookAddressOfOverload);
 }
 
@@ -11750,19 +11754,22 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
 static void DiagnoseBadDeduction(Sema &S, OverloadCandidate *Cand,
                                  unsigned NumArgs,
                                  bool TakingCandidateAddress) {
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
   TemplateDeductionResult TDK = Cand->DeductionFailure.getResult();
   if (TDK == TemplateDeductionResult::TooFewArguments ||
       TDK == TemplateDeductionResult::TooManyArguments) {
     if (CheckArityMismatch(S, Cand, NumArgs))
       return;
   }
-  DiagnoseBadDeduction(S, Cand->FoundDecl, Cand->Function, // pattern
+  DiagnoseBadDeduction(S, Cand->FoundDecl, Fn, // pattern
                        Cand->DeductionFailure, NumArgs, TakingCandidateAddress);
 }
 
 /// CUDA: diagnose an invalid call across targets.
 static void DiagnoseBadTarget(Sema &S, OverloadCandidate *Cand) {
   FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true);
+  assert(Cand->Function && "Candidate must be a Function.");
   FunctionDecl *Callee = Cand->Function;
 
   CUDAFunctionTarget CallerTarget = S.CUDA().IdentifyTarget(Caller),
@@ -11820,6 +11827,7 @@ static void DiagnoseBadTarget(Sema &S, OverloadCandidate *Cand) {
 }
 
 static void DiagnoseFailedEnableIfAttr(Sema &S, OverloadCandidate *Cand) {
+  assert(Cand->Function && "Candidate must be a function");
   FunctionDecl *Callee = Cand->Function;
   EnableIfAttr *Attr = static_cast<EnableIfAttr*>(Cand->DeductionFailure.Data);
 
@@ -11829,11 +11837,13 @@ static void DiagnoseFailedEnableIfAttr(Sema &S, OverloadCandidate *Cand) {
 }
 
 static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
-  ExplicitSpecifier ES = ExplicitSpecifier::getFromDecl(Cand->Function);
+  assert(Cand->Function && "Candidate must be a function");
+  FunctionDecl *Fn = Cand->Function;
+  ExplicitSpecifier ES = ExplicitSpecifier::getFromDecl(Fn);
   assert(ES.isExplicit() && "not an explicit candidate");
 
   unsigned Kind;
-  switch (Cand->Function->getDeclKind()) {
+  switch (Fn->getDeclKind()) {
   case Decl::Kind::CXXConstructor:
     Kind = 0;
     break;
@@ -11841,7 +11851,7 @@ static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
     Kind = 1;
     break;
   case Decl::Kind::CXXDeductionGuide:
-    Kind = Cand->Function->isImplicit() ? 0 : 2;
+    Kind = Fn->isImplicit() ? 0 : 2;
     break;
   default:
     llvm_unreachable("invalid Decl");
@@ -11851,7 +11861,7 @@ static void DiagnoseFailedExplicitSpec(Sema &S, OverloadCandidate *Cand) {
   // (particularly an out-of-class definition) will typically lack the
   // 'explicit' specifier.
   // FIXME: This is probably a good thing to do for all 'candidate' notes.
-  FunctionDecl *First = Cand->Function->getFirstDecl();
+  FunctionDecl *First = Fn->getFirstDecl();
   if (FunctionDecl *Pattern = First->getTemplateInstantiationPattern())
     First = Pattern->getFirstDecl();
 
@@ -11920,6 +11930,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
                                   unsigned NumArgs,
                                   bool TakingCandidateAddress,
                                   LangAS CtorDestAS = LangAS::Default) {
+  assert(Cand->Function && "Candidate must be a function");
   FunctionDecl *Fn = Cand->Function;
   if (shouldSkipNotingLambdaConversionDecl(Fn))
     return;
@@ -11934,8 +11945,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
   // Skip implicit member functions when trying to resolve
   // the address of a an overload set for a function pointer.
   if (Cand->TookAddressOfOverload &&
-      !Cand->Function->hasCXXExplicitFunctionObjectParameter() &&
-      !Cand->Function->isStatic())
+      !Fn->hasCXXExplicitFunctionObjectParameter() && !Fn->isStatic())
     return;
 
   // Note deleted candidates, but only if they're viable.
@@ -12033,7 +12043,7 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
     return;
 
   case ovl_fail_addr_not_available: {
-    bool Available = checkAddressOfCandidateIsAvailable(S, Cand->Function);
+    bool Available = checkAddressOfCandidateIsAvailable(S, Fn);
     (void)Available;
     assert(!Available);
     break;

From 0431d6dab40b05d9f4a312a9c170c81a889bfb49 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Wed, 24 Jul 2024 17:00:12 -0400
Subject: [PATCH 070/259] Clang: convert `__m64` intrinsics to unconditionally
 use SSE2 instead of MMX. (#96540)

The MMX instruction set is legacy, and the SSE2 variants are in every
way superior, when they are available -- and they have been available
since the Pentium 4 was released, 20 years ago.

Therefore, we are switching the "MMX" intrinsics to depend on SSE2,
unconditionally. This change entirely drops the ability to generate
vectorized code using compiler intrinsics for chips with MMX but without
SSE2: the Intel Pentium MMX, Pentium II, and Pentium III (released
1997-1999), as well as AMD K6 and K7 series chips of around the same
timeframe. Targeting these older CPUs remains supported -- simply
without the ability to use MMX compiler intrinsics.

Migrating away from the use of MMX registers also fixes a rather
non-obvious requirement. The long-standing programming model for these
MMX intrinsics requires that the programmer be aware of the x87/MMX
mode-switching semantics, and manually call `_mm_empty()` between using
any MMX instruction and any x87 FPU instruction. If you neglect to, then
every future x87 operation will return a NaN result. This requirement is
not at all obvious to users of these intrinsic functions, and causes
bugs that are very difficult to detect.

Worse, even if the user did write code that correctly calls
`_mm_empty()` in the right places, LLVM may sometimes reorder x87 and
mmx operations around each-other, unaware of this mode switching issue.

Eliminating the use of MMX registers eliminates this problem.
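
As a minimal illustration (not part of this patch), the legacy programming
model described above looks roughly like the following user code; it assumes
an x86 target where the MMX/SSE2 intrinsics are available:

  /* Sketch of the legacy MMX programming model: _mm_empty() must separate
     MMX intrinsic use from any x87 floating-point work. */
  #include <mmintrin.h>
  #include <stdio.h>

  int main(void) {
    __m64 a = _mm_set_pi16(1, 2, 3, 4);
    __m64 b = _mm_set_pi16(5, 6, 7, 8);
    __m64 sum = _mm_add_pi16(a, b);        /* MMX intrinsic (now SSE2-backed) */
    long long bits = _mm_cvtm64_si64(sum); /* read the packed result back out */

    _mm_empty(); /* legacy requirement before any x87 FP use; under the old
                    MMX lowering, skipping this could make later FP results NaN */

    double x = 1.5 * 2.0;                  /* x87 FP work, safe after _mm_empty() */
    printf("%llx %f\n", (unsigned long long)bits, x);
    return 0;
  }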

This change also deletes the now-unnecessary MMX `__builtin_ia32_*`
functions from Clang. Only 3 MMX-related builtins remain in use --
`__builtin_ia32_emms`, used by `_mm_empty`, and
`__builtin_ia32_vec_{ext,set}_v4si`, used by `_mm_insert_pi16` and
`_mm_extract_pi16`. Note particularly that the latter two lower to
generic, non-MMX, IR. Support for the LLVM intrinsics underlying these
removed builtins still remains, for the moment.

The file `clang/www/builtins.py` has been updated with mappings from the
newly-removed `__builtin_ia32` functions to the still-supported
equivalents in `mmintrin.h`.

(Originally uploaded at https://reviews.llvm.org/D86855 and
https://reviews.llvm.org/D94252)

Fixes issue #41665
Works towards #98272
---
 clang/docs/ReleaseNotes.rst                   |  14 +
 clang/include/clang/Basic/BuiltinsX86.def     | 103 +-----
 clang/lib/CodeGen/CGBuiltin.cpp               |   8 +-
 clang/lib/Headers/emmintrin.h                 |  41 +--
 clang/lib/Headers/mmintrin.h                  | 308 ++++++++++--------
 clang/lib/Headers/tmmintrin.h                 |  96 +++---
 clang/lib/Headers/xmmintrin.h                 | 193 +++++------
 clang/lib/Sema/SemaX86.cpp                    |   1 -
 clang/test/CodeGen/X86/mmx-builtins.c         | 209 ++++++------
 clang/test/CodeGen/X86/mmx-inline-asm.c       |   2 +-
 .../CodeGen/X86/mmx-shift-with-immediate.c    |  18 +-
 clang/test/CodeGen/attr-target-x86-mmx.c      |   7 +-
 clang/test/CodeGen/builtins-x86.c             |  78 +----
 clang/test/CodeGen/palignr.c                  |  15 -
 clang/test/CodeGen/pr26099.c                  |  12 -
 clang/test/Headers/xmmintrin.c                |   2 +-
 clang/test/Sema/x86-builtin-palignr.c         |   2 +-
 clang/www/builtins.py                         |  98 +++++-
 llvm/include/llvm/IR/IntrinsicsX86.td         | 182 +++++------
 19 files changed, 687 insertions(+), 702 deletions(-)
 delete mode 100644 clang/test/CodeGen/pr26099.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7a09822a1dd52..e2a680d73ab05 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -172,6 +172,20 @@ AMDGPU Support
 X86 Support
 ^^^^^^^^^^^
 
+- The MMX vector intrinsic functions from ``*mmintrin.h`` which
+  operate on `__m64` vectors, such as ``_mm_add_pi8``, have been
+  reimplemented to use the SSE2 instruction-set and XMM registers
+  unconditionally. These intrinsics are therefore *no longer
+  supported* if MMX is enabled without SSE2 -- either from targeting
+  CPUs from the Pentium-MMX through the Pentium 3, or explicitly via
+  passing arguments such as ``-mmmx -mno-sse2``.
+
+- The compiler builtins such as ``__builtin_ia32_paddb`` which
+  formerly implemented the above MMX intrinsic functions have been
+  removed. Any uses of these removed functions should migrate to the
+  functions defined by the ``*mmintrin.h`` headers. A mapping can be
+  found in the file ``clang/www/builtins.py``.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index a85e7918f4d7e..06ca30d65f5bd 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -47,107 +47,8 @@ TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "")
 // doesn't work in the presence of re-declaration of _mm_prefetch for windows.
 TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
 TARGET_BUILTIN(__builtin_ia32_emms, "v", "n", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_paddusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubsw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psubusw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmulhw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmullw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd, "V2iV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pand, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pandn, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_por, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pxor, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrld, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlq, "V1OiV1OiV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psraw, "V4sV4sV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrad, "V2iV2iV1Oi", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pslldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psllqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlwi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrldi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrlqi, "V1OiV1Oii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psrawi, "V4sV4si", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_psradi, "V2iV2ii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packsswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packssdw, "V4sV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_packuswb, "V8cV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckhdq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklbw, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpcklwd, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_punpckldq, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpeqd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtb, "V8cV8cV8c", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtw, "V4sV4sV4s", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_pcmpgtd, "V2iV2iV2i", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_movntq, "vV1Oi*V1Oi", "nV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v2si, "V2iii", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v4hi, "V4sssss", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_init_v8qi, "V8ccccccccc", "ncV:64:", "mmx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v2si, "iV2ii", "ncV:64:", "mmx")
-
-// MMX2 (MMX+SSE) intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtpi2ps, "V4fV4fV2i", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgb, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pavgw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmaxub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminsw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pminub, "V8cV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse")
-
-// MMX+SSE2
-TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpi2pd, "V2dV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_paddq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq, "V1OiV2iV2i", "ncV:64:", "mmx,sse2")
-TARGET_BUILTIN(__builtin_ia32_psubq, "V1OiV1OiV1Oi", "ncV:64:", "mmx,sse2")
-
-// MMX+SSSE3
-TARGET_BUILTIN(__builtin_ia32_pabsb, "V8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsd, "V2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pabsw, "V4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_palignr, "V8cV8cV8cIc", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phaddw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_phsubw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_pshufb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignw, "V4sV4sV4s", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignb, "V8cV8cV8c", "ncV:64:", "mmx,ssse3")
-TARGET_BUILTIN(__builtin_ia32_psignd, "V2iV2iV2i", "ncV:64:", "mmx,ssse3")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse")
 
 // SSE intrinsics.
 TARGET_BUILTIN(__builtin_ia32_comieq, "iV4fV4f", "ncV:128:", "sse")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f1dee801e4fe7..0c4d0efb70ea5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14523,12 +14523,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // TODO: If we had a "freeze" IR instruction to generate a fixed undef
     // value, we should use that here instead of a zero.
     return llvm::Constant::getNullValue(ConvertType(E->getType()));
-  case X86::BI__builtin_ia32_vec_init_v8qi:
-  case X86::BI__builtin_ia32_vec_init_v4hi:
-  case X86::BI__builtin_ia32_vec_init_v2si:
-    return Builder.CreateBitCast(BuildVector(Ops),
-                                 llvm::Type::getX86_MMXTy(getLLVMContext()));
-  case X86::BI__builtin_ia32_vec_ext_v2si:
+  case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
   case X86::BI__builtin_ia32_vec_ext_v4si:
@@ -14546,6 +14541,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // Otherwise we could just do this in the header file.
     return Builder.CreateExtractElement(Ops[0], Index);
   }
+  case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v16qi:
   case X86::BI__builtin_ia32_vec_set_v8hi:
   case X86::BI__builtin_ia32_vec_set_v4si:
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e85bfc47aa5cc..a3176570a468f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -52,9 +52,12 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@@ -1486,8 +1489,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
 }
 
 /// Converts the two double-precision floating-point elements of a
@@ -1505,8 +1508,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
 }
 
 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1520,8 +1523,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
 /// \param __a
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
+  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
 }
 
 /// Returns the low-order element of a 128-bit vector of [2 x double] as
@@ -2108,9 +2111,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
 /// \param __b
 ///    A 64-bit integer.
 /// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
+  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
 }
 
 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@@ -2431,9 +2433,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
-                                                            __m64 __b) {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
+                                             (__v4si)__anyext128(__b)));
 }
 
 /// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2539,9 +2541,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
+  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
 }
 
 /// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4889,8 +4890,10 @@ void _mm_pause(void);
 #if defined(__cplusplus)
 } // extern "C"
 #endif
+
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 4e154e2d85935..9d1e135be63be 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -21,10 +21,33 @@ typedef int __v2si __attribute__((__vector_size__(8)));
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
+/* Unsigned types */
+typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
+typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v8qs __attribute__((__vector_size__(8)));
+
+/* SSE/SSE2 types */
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
-                 __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2                                                \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@@ -50,10 +73,10 @@ _mm_empty(void) {
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi32_si64(int __i)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+    return __extension__ (__m64)(__v2si){__i, 0};
 }
 
 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -67,10 +90,10 @@ _mm_cvtsi32_si64(int __i)
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_si32(__m64 __m)
 {
-    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+    return ((__v2si)__m)[0];
 }
 
 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -83,7 +106,7 @@ _mm_cvtsi64_si32(__m64 __m)
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
@@ -99,7 +122,7 @@ _mm_cvtsi64_m64(long long __i)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
@@ -124,10 +147,11 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_packsswb128(
+        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
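The pack conversions all follow the same shape: shuffle the two 64-bit inputs into adjacent halves of one 128-bit vector, run the SSE2 pack against a zero second operand, and keep the low 64 bits. Per lane this is the usual signed 16-to-8 clamp; a scalar reference, illustrative only:

/* Scalar reference for one lane of the signed 16->8 saturating pack. */
#include <stdint.h>

int8_t packs_i16_to_i8_ref(int16_t x) {
  if (x > INT8_MAX) return INT8_MAX;
  if (x < INT8_MIN) return INT8_MIN;
  return (int8_t)x;
}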
@@ -149,10 +173,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+    return __trunc64(__builtin_ia32_packssdw128(
+        (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -174,10 +199,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_packuswb128(
+        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -201,10 +227,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
 ///    Bits [63:56] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+                                          4, 12, 5, 13, 6, 14, 7, 15);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -224,10 +251,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [63:48] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+                                          2, 6, 3, 7);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -245,10 +273,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -272,10 +300,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 ///    Bits [31:24] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+                                          0, 8, 1, 9, 2, 10, 3, 11);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -295,10 +324,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [31:16] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+                                          0, 4, 1, 5);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -316,10 +346,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
 }
 
 /// Adds each 8-bit integer element of the first 64-bit integer vector
@@ -337,10 +367,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
 }
 
 /// Adds each 16-bit integer element of the first 64-bit integer vector
@@ -358,10 +388,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
 }
 
 /// Adds each 32-bit integer element of the first 64-bit integer vector
@@ -379,10 +409,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
 }
 
 /// Adds, with saturation, each 8-bit signed integer element of the first
@@ -403,10 +433,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Adds, with saturation, each 16-bit signed integer element of the first
@@ -427,10 +457,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Adds, with saturation, each 8-bit unsigned integer element of the first
@@ -450,10 +480,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Adds, with saturation, each 16-bit unsigned integer element of the first
@@ -473,10 +503,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Subtracts each 8-bit integer element of the second 64-bit integer
@@ -494,10 +524,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
 }
 
 /// Subtracts each 16-bit integer element of the second 64-bit integer
@@ -515,10 +545,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
 }
 
 /// Subtracts each 32-bit integer element of the second 64-bit integer
@@ -536,10 +566,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
 }
 
 /// Subtracts, with saturation, each 8-bit signed integer element of the second
@@ -560,10 +590,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Subtracts, with saturation, each 16-bit signed integer element of the
@@ -584,10 +614,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
@@ -608,10 +638,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
@@ -632,10 +662,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -659,10 +689,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
 ///    products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
+                                               (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -680,10 +711,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
+                                              (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -701,10 +733,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
 }
 
 /// Left-shifts each 16-bit signed integer element of the first
@@ -724,10 +756,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
@@ -746,10 +779,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Left-shifts each 32-bit signed integer element of the first
@@ -769,10 +803,11 @@ _mm_slli_pi16(__m64 __m, int __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
@@ -791,10 +826,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Left-shifts the first 64-bit integer parameter by the number of bits
@@ -811,10 +847,11 @@ _mm_slli_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
+                                             (__v2di)__anyext128(__count)));
 }
 
 /// Left-shifts the first parameter, which is a 64-bit integer, by the
@@ -831,10 +868,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -855,10 +893,11 @@ _mm_slli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -878,10 +917,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -902,10 +942,11 @@ _mm_srai_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -925,10 +966,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -948,10 +990,11 @@ _mm_srai_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
+                                             (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -970,10 +1013,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -993,10 +1037,11 @@ _mm_srli_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
+                                             (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1015,10 +1060,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
+                                              __count));
 }
 
 /// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1035,10 +1081,11 @@ _mm_srli_pi32(__m64 __m, int __count)
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
+                                             (__v2di)__anyext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1056,10 +1103,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
+                                              __count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
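The logical shift intrinsics above inherit PSLLW/PSRLW-style counting from their SSE2 forms: the count operand is read as an unsigned 64-bit value and any count of 16 or more clears the lane (the arithmetic forms _mm_sra_* instead fill the lane with its sign bit for large counts). A scalar reference for one 16-bit lane, illustrative only:

/* Scalar reference for one 16-bit lane of the logical shifts: counts of
   16 or more zero the lane, matching PSLLW/PSRLW. */
#include <stdint.h>

uint16_t sll_lane16_ref(uint16_t x, uint64_t count) {
  return count >= 16 ? 0 : (uint16_t)(x << count);
}

uint16_t srl_lane16_ref(uint16_t x, uint64_t count) {
  return count >= 16 ? 0 : (uint16_t)(x >> count);
}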
@@ -1074,10 +1122,10 @@ _mm_srli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1095,10 +1143,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1113,10 +1161,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1131,10 +1179,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1153,10 +1201,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1175,10 +1223,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1197,10 +1245,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1219,10 +1267,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+  /* This function always performs a signed comparison, but __v8qi is a char
+     which may be signed or unsigned, so use __v8qs. */
+    return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1241,10 +1291,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1263,10 +1313,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
@@ -1276,7 +1326,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setzero_si64(void)
 {
     return __extension__ (__m64){ 0LL };
@@ -1297,10 +1347,10 @@ _mm_setzero_si64(void)
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi32(int __i1, int __i0)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+    return __extension__ (__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1320,10 +1370,10 @@ _mm_set_pi32(int __i1, int __i0)
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
-    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+    return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1351,12 +1401,12 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
 {
-    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
-                                               __b4, __b5, __b6, __b7);
+    return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
+                                         __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1372,7 +1422,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi32(int __i)
 {
     return _mm_set_pi32(__i, __i);
@@ -1391,7 +1441,7 @@ _mm_set1_pi32(int __i)
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi16(short __w)
 {
     return _mm_set_pi16(__w, __w, __w, __w);
@@ -1409,7 +1459,7 @@ _mm_set1_pi16(short __w)
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi8(char __b)
 {
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
@@ -1430,7 +1480,7 @@ _mm_set1_pi8(char __b)
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi32(int __i0, int __i1)
 {
     return _mm_set_pi32(__i1, __i0);
@@ -1453,7 +1503,7 @@ _mm_setr_pi32(int __i0, int __i1)
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
@@ -1484,14 +1534,16 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
 {
     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __anyext128
+#undef __trunc64
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Aliases for compatibility. */
 #define _m_empty _mm_empty
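One detail worth noting from the mmintrin.h changes above: the byte-wise saturating math and compares go through the explicitly signed __v8qs element type, so the semantics stay signed even on targets where plain char is unsigned. Per lane the saturating add is the familiar clamp; a scalar reference, illustrative only:

/* Scalar reference for one lane of _mm_adds_pi8-style signed saturation,
   i.e. what __builtin_elementwise_add_sat computes per signed-char lane. */
#include <stdint.h>

int8_t adds_lane8_ref(int8_t a, int8_t b) {
  int s = (int)a + (int)b;
  if (s > INT8_MAX) return INT8_MAX;
  if (s < INT8_MIN) return INT8_MIN;
  return (int8_t)s;
}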
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index bf8327b692d1c..bd832ce8dddfd 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -19,11 +19,13 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,ssse3,no-evex512"),                           \
-                 __min_vector_width__(64)))
+                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
 
 /// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
@@ -37,10 +39,10 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+  return (__m64)__builtin_elementwise_abs((__v8qs)__a);
 }
 
 /// Computes the absolute value of each of the packed 8-bit signed
@@ -73,10 +75,10 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -109,10 +111,10 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+    return (__m64)__builtin_elementwise_abs((__v2si)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -177,7 +179,10 @@ _mm_abs_epi32(__m128i __a)
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
+  ((__m64)__builtin_shufflevector(                                       \
+       __builtin_ia32_psrldqi128_byteshift(                              \
+           __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
+           (n)), __extension__ (__v2di){}, 0))
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
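The rewritten _mm_alignr_pi8 builds the 128-bit value [a : b] with b in the low half, byte-shifts it right by n using the SSE2 byte-shift builtin, and keeps the low 64 bits, which matches the PALIGNR definition. A scalar reference of that behaviour, illustrative only and assuming little-endian byte order:

/* Scalar reference: concatenate a:b (b low), shift right by n bytes,
   keep the low 8 bytes; bytes shifted in from above are zero. */
#include <stdint.h>
#include <string.h>

uint64_t alignr_pi8_ref(uint64_t a, uint64_t b, unsigned n) {
  unsigned char buf[16];
  memcpy(buf, &b, 8);        /* bytes 0..7  = b */
  memcpy(buf + 8, &a, 8);    /* bytes 8..15 = a */
  unsigned char out[8] = {0};
  for (unsigned i = 0; i < 8; ++i)
    if (i + n < 16)
      out[i] = buf[i + n];
  uint64_t r;
  memcpy(&r, out, 8);
  return r;
}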
@@ -242,10 +247,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phaddw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -265,10 +271,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_phaddd128(
+        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -317,10 +324,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phaddsw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -386,10 +394,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phsubw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -409,10 +418,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_phsubd128(
+        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -461,10 +471,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_phsubsw128(
+        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -525,10 +536,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                                 (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -565,10 +577,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
+                                                (__v8hi)__anyext128(__b)));
 }
 
 /// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -614,12 +627,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    1: Clear the corresponding byte in the destination. \n
 ///    0: Copy the selected source byte to the corresponding byte in the
 ///    destination. \n
-///    Bits [3:0] select the source byte to be copied.
+///    Bits [2:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_pshufb128(
+        (__v16qi)__builtin_shufflevector(
+            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
+        (__v16qi)__anyext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
@@ -720,10 +736,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
+                                              (__v16qi)__anyext128(__b)));
 }
 
 /// For each 16-bit integer in the first source operand, perform one of
@@ -746,10 +763,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
+                                              (__v8hi)__anyext128(__b)));
 }
 
 /// For each 32-bit integer in the first source operand, perform one of
@@ -772,13 +790,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
+                                              (__v4si)__anyext128(__b)));
 }
 
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #endif /* __TMMINTRIN_H */
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 1ef89de9c9f56..5b9e90e8f061c 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -35,9 +35,21 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                  __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX                                                 \
+#define __DEFAULT_FN_ATTRS_SSE2                                                \
   __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           \
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x)                                                           \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, 2, 3)
+#define __anyext128(x)                                                         \
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
+                                    1, -1, -1)
+#define __zeroupper64(x)                                                       \
+  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
+                                    1, 4, 5)
 
 /// Adds the 32-bit float values in the low-order bits of the operands.
 ///
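xmmintrin.h needs two helpers beyond __trunc64/__anyext128: __zext128 zero-fills the upper 64 bits, for operations that read the whole 128-bit value (pmovmskb, the maskmovdqu mask), and __zeroupper64 clears the upper two 32-bit lanes of a 128-bit value before the four-lane float conversions. The only difference from __anyext128 is whether the upper half is guaranteed zero or left undefined; roughly, with illustrative names and assuming Clang's __builtin_shufflevector:

/* Illustrative sketch of the zero-extending variant: indices 2 and 3 select
   lanes of the zero vector, so the upper 64 bits are defined zeros rather
   than "don't care". */
typedef int       v2si __attribute__((__vector_size__(8)));
typedef long long v2di __attribute__((__vector_size__(16)));

v2di zext128_sketch(v2si x) {
  return (v2di)__builtin_shufflevector(x, (v2si){}, 0, 1, 2, 3);
}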
@@ -1448,10 +1460,10 @@ _mm_cvtss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts two low-order float values in a 128-bit vector of
@@ -1468,7 +1480,7 @@ _mm_cvtps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
@@ -1558,10 +1570,10 @@ _mm_cvttss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvttps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
@@ -1579,7 +1591,7 @@ _mm_cvttps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
@@ -1674,10 +1686,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
-  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+  return (__m128)__builtin_shufflevector(
+      (__v4sf)__a,
+      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
+      4, 5, 2, 3);
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -1697,7 +1712,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
@@ -2231,10 +2246,10 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    A pointer to an aligned memory location used to store the register value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(void *__p, __m64 __a)
 {
-  __builtin_ia32_movntq((__m64 *)__p, __a);
+  __builtin_nontemporal_store(__a, (__m64 *)__p);
 }
 
 /// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2296,7 +2311,7 @@ void _mm_sfence(void);
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
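The _mm_extract_pi16 change matters for negative lanes: PEXTRW zero-extends the selected word into the 32-bit result, so casting through unsigned short keeps the macro's value in 0..65535 instead of sign-extending it. A small sketch of the intended behaviour, assuming an x86 target where these intrinsics are available:

/* Sketch of the zero-extension behaviour (illustrative only). */
#include <xmmintrin.h>
#include <assert.h>

int main(void) {
  __m64 v = _mm_set_pi16(-1, 2, 3, 4);       /* lane 3 holds 0xFFFF */
  assert(_mm_extract_pi16(v, 3) == 0xFFFF);  /* 65535, not -1 */
  return 0;
}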
@@ -2342,10 +2357,10 @@ void _mm_sfence(void);
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2361,10 +2376,10 @@ _mm_max_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Compares each of the corresponding packed 16-bit integer values of
@@ -2380,10 +2395,10 @@ _mm_max_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2399,10 +2414,10 @@ _mm_min_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
 }
 
 /// Takes the most significant bit from each 8-bit element in a 64-bit
@@ -2417,10 +2432,10 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing the values with bits to be extracted.
 /// \returns The most significant bit from each 8-bit element in \a __a,
 ///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_movemask_pi8(__m64 __a)
 {
-  return __builtin_ia32_pmovmskb((__v8qi)__a);
+  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
 }
 
 /// Multiplies packed 16-bit unsigned integer values and writes the
@@ -2436,10 +2451,11 @@ _mm_movemask_pi8(__m64 __a)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
+                                             (__v8hi)__anyext128(__b)));
 }
 
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@@ -2476,8 +2492,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
 ///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) \
-  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
+#define _mm_shuffle_pi16(a, n)                                                 \
+  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
+                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
+                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
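
The macro peels four 2-bit selectors off the control byte, one per destination element. For example, n = 3 yields selectors 3, 0, 0, 0, which is exactly the <i32 3, i32 0, i32 0, i32 0> mask the updated test below checks for. A small sketch of the same decoding (illustrative only):

  /* Decode an 8-bit pshufw-style immediate into four lane indices,
     mirroring the macro above: n = 3 -> {3, 0, 0, 0}. */
  static void decode_pshufw_imm(int n, int idx[4]) {
    idx[0] = n & 0x3;
    idx[1] = (n >> 2) & 0x3;
    idx[2] = (n >> 4) & 0x3;
    idx[3] = (n >> 6) & 0x3;
  }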
 
 /// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
@@ -2502,10 +2520,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    A pointer to a 64-bit memory location that will receive the conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
-  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+  // This is complex, because we need to support the case where __p is pointing
+  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
+  // write might cause a trap where a 64-bit maskmovq would not. (Memory
+  // locations not selected by the mask bits might still cause traps.)
+  __m128i __d128  = __anyext128(__d);
+  __m128i __n128  = __zext128(__n);
+  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
+      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
+    // If there's a risk of spurious trap due to a 128-bit write, back up the
+    // pointer by 8 bytes and shift values in registers to match.
+    __p -= 8;
+    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
+    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
+  }
+
+  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
 }
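
To make the comment's arithmetic concrete: with 4 KiB pages, an 8-byte maskmovq starting at page offset 4081 through 4088 stays on its page, while the 16-byte maskmovdqu used here would run onto the next page; backing the pointer up by 8 and shifting the data and mask into the upper half keeps the selected bytes at their original addresses. A small sketch of just the offset check (assuming 4 KiB pages, as the 0xfff mask does):

  /* Returns nonzero when a 16-byte store at p could fault on the next page
     even though the original 8-byte store would not (offsets 4081..4088). */
  static int needs_backoff(const void *p) {
    unsigned long off = (unsigned long)p & 0xfff;
    return (off + 16 > 4096) && (off + 8 <= 4096);
  }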
 
 /// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -2521,10 +2554,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
+                                           (__v16qi)__anyext128(__b)));
 }
 
 /// Computes the rounded averages of the packed unsigned 16-bit integer
@@ -2540,10 +2574,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
+                                           (__v8hi)__anyext128(__b)));
 }
 
 /// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -2562,10 +2597,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
+                                            (__v16qi)__zext128(__b)));
 }
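
Since the widened psadbw form keeps the semantics documented above, a plain-C reference model (an illustration, not the header's implementation) of what _mm_sad_pu8 computes:

  /* Reference model: the sum of absolute byte differences lands in bits
     [15:0] (at most 8 * 255 = 2040), and bits [63:16] stay zero. */
  #include <stdint.h>

  static uint64_t sad_pu8_ref(const uint8_t a[8], const uint8_t b[8]) {
    uint64_t sum = 0;
    for (int i = 0; i < 8; ++i)
      sum += a[i] > b[i] ? (uint64_t)(a[i] - b[i]) : (uint64_t)(b[i] - a[i]);
    return sum;
  }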
 
 #if defined(__cplusplus)
@@ -2846,22 +2882,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi16(__b, __a);
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hi)__a, __v4sf);
 }
 
 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
@@ -2876,21 +2900,10 @@ _mm_cvtpi16_ps(__m64 __a)
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu16_ps(__m64 __a)
 {
-  __m64 __b, __c;
-  __m128 __r;
-
-  __b = _mm_setzero_si64();
-  __c = _mm_unpackhi_pi16(__a, __b);
-  __r = _mm_setzero_ps();
-  __r = _mm_cvtpi32_ps(__r, __c);
-  __r = _mm_movelh_ps(__r, __r);
-  __c = _mm_unpacklo_pi16(__a, __b);
-  __r = _mm_cvtpi32_ps(__r, __c);
-
-  return __r;
+  return __builtin_convertvector((__v4hu)__a, __v4sf);
 }
 
 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@@ -2905,16 +2918,12 @@ _mm_cvtpu16_ps(__m64 __a)
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_cmpgt_pi8(__b, __a);
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
@@ -2930,15 +2939,12 @@ _mm_cvtpi8_ps(__m64 __a)
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu8_ps(__m64 __a)
 {
-  __m64 __b;
-
-  __b = _mm_setzero_si64();
-  __b = _mm_unpacklo_pi8(__a, __b);
-
-  return _mm_cvtpi16_ps(__b);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts the two 32-bit signed integer values from each 64-bit vector
@@ -2957,16 +2963,12 @@ _mm_cvtpu8_ps(__m64 __a)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
-  __m128 __c;
-
-  __c = _mm_setzero_ps();
-  __c = _mm_cvtpi32_ps(__c, __b);
-  __c = _mm_movelh_ps(__c, __c);
-
-  return _mm_cvtpi32_ps(__c, __a);
+  return __builtin_convertvector(
+      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
+                              0, 1, 2, 3), __v4sf);
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -2986,16 +2988,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi16(__m128 __a)
 {
-  __m64 __b, __c;
-
-  __b = _mm_cvtps_pi32(__a);
-  __a = _mm_movehl_ps(__a, __a);
-  __c = _mm_cvtps_pi32(__a);
-
-  return _mm_packs_pi32(__b, __c);
+  return __trunc64(__builtin_ia32_packssdw128(
+      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -3016,7 +3013,7 @@ _mm_cvtps_pi16(__m128 __a)
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the upper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
@@ -3196,8 +3193,12 @@ do { \
 #define _m_psadbw _mm_sad_pu8
 #define _m_ _mm_
 
+#undef __trunc64
+#undef __zext128
+#undef __anyext128
+#undef __zeroupper64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index be26454ce909d..8f9057bbaf259 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -502,7 +502,6 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   switch (BuiltinID) {
   default:
     return false;
-  case X86::BI__builtin_ia32_vec_ext_v2si:
   case X86::BI__builtin_ia32_vec_ext_v2di:
   case X86::BI__builtin_ia32_vextractf128_pd256:
   case X86::BI__builtin_ia32_vextractf128_ps256:
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 5b5bc301bddc0..495ae7e181159 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -1,193 +1,200 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
 
 
 #include <immintrin.h>
 
 __m64 test_mm_abs_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b
+  // CHECK: call <8 x i8> @llvm.abs.v8i8(
   return _mm_abs_pi8(a);
 }
 
 __m64 test_mm_abs_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w
+  // CHECK: call <4 x i16> @llvm.abs.v4i16(
   return _mm_abs_pi16(a);
 }
 
 __m64 test_mm_abs_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_abs_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d
+  // CHECK: call <2 x i32> @llvm.abs.v2i32(
   return _mm_abs_pi32(a);
 }
 
 __m64 test_mm_add_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.b
+  // CHECK: add <8 x i8> {{%.*}}, {{%.*}}
   return _mm_add_pi8(a, b);
 }
 
 __m64 test_mm_add_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.w
+  // CHECK: add <4 x i16> {{%.*}}, {{%.*}}
   return _mm_add_pi16(a, b);
 }
 
 __m64 test_mm_add_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.d
+  // CHECK: add <2 x i32> {{%.*}}, {{%.*}}
   return _mm_add_pi32(a, b);
 }
 
 __m64 test_mm_add_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: add i64 {{%.*}}, {{%.*}}
   return _mm_add_si64(a, b);
 }
 
 __m64 test_mm_adds_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.b
+  // CHECK: call <8 x i8> @llvm.sadd.sat.v8i8(
   return _mm_adds_pi8(a, b);
 }
 
 __m64 test_mm_adds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.padds.w
+  // CHECK: call <4 x i16> @llvm.sadd.sat.v4i16(
   return _mm_adds_pi16(a, b);
 }
 
 __m64 test_mm_adds_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b
+  // CHECK: call <8 x i8> @llvm.uadd.sat.v8i8(
   return _mm_adds_pu8(a, b);
 }
 
 __m64 test_mm_adds_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_adds_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w
+  // CHECK: call <4 x i16> @llvm.uadd.sat.v4i16(
   return _mm_adds_pu16(a, b);
 }
 
 __m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_alignr_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b
+  // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   return _mm_alignr_pi8(a, b, 2);
 }
 
 __m64 test_mm_and_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_and_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pand
+  // CHECK: and <1 x i64> {{%.*}}, {{%.*}}
   return _mm_and_si64(a, b);
 }
 
 __m64 test_mm_andnot_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_andnot_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pandn
+  // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, <i64 -1>
+  // CHECK: and <1 x i64> [[TMP]], {{%.*}}
   return _mm_andnot_si64(a, b);
 }
 
 __m64 test_mm_avg_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(
   return _mm_avg_pu8(a, b);
 }
 
 __m64 test_mm_avg_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_avg_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(
   return _mm_avg_pu16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b
+  // CHECK:      [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpeq_pi8(a, b);
 }
 
 __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w
+  // CHECK:      [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpeq_pi16(a, b);
 }
 
 __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpeq_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d
+  // CHECK:      [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpeq_pi32(a, b);
 }
 
 __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b
+  // CHECK:      [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
   return _mm_cmpgt_pi8(a, b);
 }
 
 __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w
+  // CHECK:      [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
   return _mm_cmpgt_pi16(a, b);
 }
 
 __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cmpgt_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d
+  // CHECK:      [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}}
+  // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
   return _mm_cmpgt_pi32(a, b);
 }
 
 __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvt_pi2ps
-  // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvt_pi2ps(a, b);
 }
 
 __m64 test_mm_cvt_ps2pi(__m128 a) {
   // CHECK-LABEL: test_mm_cvt_ps2pi
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvt_ps2pi(a);
 }
 
 __m64 test_mm_cvtpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvtpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(
   return _mm_cvtpd_pi32(a);
 }
 
 __m128 test_mm_cvtpi16_ps(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi16_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float>
   return _mm_cvtpi16_ps(a);
 }
 
 __m128d test_mm_cvtpi32_pd(__m64 a) {
   // CHECK-LABEL: test_mm_cvtpi32_pd
-  // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd
+  // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double>
   return _mm_cvtpi32_pd(a);
 }
 
 __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32_ps(a, b);
 }
 
 __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_cvtpi32x2_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
-  // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+  // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
   return _mm_cvtpi32x2_ps(a, b);
 }
 
 __m64 test_mm_cvtps_pi16(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}})
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]],
   return _mm_cvtps_pi16(a);
 }
 
 __m64 test_mm_cvtps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
   return _mm_cvtps_pi32(a);
 }
 
@@ -205,19 +212,19 @@ int test_mm_cvtsi64_si32(__m64 a) {
 
 __m64 test_mm_cvttpd_pi32(__m128d a) {
   // CHECK-LABEL: test_mm_cvttpd_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(
   return _mm_cvttpd_pi32(a);
 }
 
 __m64 test_mm_cvttps_pi32(__m128 a) {
   // CHECK-LABEL: test_mm_cvttps_pi32
-  // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(
   return _mm_cvttps_pi32(a);
 }
 
 int test_mm_extract_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_extract_pi16
-  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  // CHECK: extractelement <4 x i16> {{%.*}}, i64 2
   return _mm_extract_pi16(a, 2);
 }
 
@@ -234,151 +241,153 @@ __m64 test_m_from_int64(long long a) {
 
 __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  // CHECK: insertelement <4 x i16>
   return _mm_insert_pi16(a, d, 2);
 }
 
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
   return _mm_madd_pi16(a, b);
 }
 
 __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_maddubs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
   return _mm_maddubs_pi16(a, b);
 }
 
 void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
   // CHECK-LABEL: test_mm_maskmove_si64
-  // CHECK: call void @llvm.x86.mmx.maskmovq
+  // CHECK: call void @llvm.x86.sse2.maskmov.dqu(
   _mm_maskmove_si64(d, n, p);
 }
 
 __m64 test_mm_max_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w
+  // CHECK: call <4 x i16> @llvm.smax.v4i16(
   return _mm_max_pi16(a, b);
 }
 
 __m64 test_mm_max_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_max_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b
+  // CHECK: call <8 x i8> @llvm.umax.v8i8(
   return _mm_max_pu8(a, b);
 }
 
 __m64 test_mm_min_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w
+  // CHECK: call <4 x i16> @llvm.smin.v4i16(
   return _mm_min_pi16(a, b);
 }
 
 __m64 test_mm_min_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_min_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b
+  // CHECK: call <8 x i8> @llvm.umin.v8i8(
   return _mm_min_pu8(a, b);
 }
 
 int test_mm_movemask_pi8(__m64 a) {
   // CHECK-LABEL: test_mm_movemask_pi8
-  // CHECK: call i32 @llvm.x86.mmx.pmovmskb
+  // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(
   return _mm_movemask_pi8(a);
 }
 
 __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mul_su32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: and <2 x i64> {{%.*}}, <i64 4294967295, i64 4294967295>
+  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_su32(a, b);
 }
 
 __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(
   return _mm_mulhi_pi16(a, b);
 }
 
 __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhi_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(
   return _mm_mulhi_pu16(a, b);
 }
 
 __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mulhrs_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(
   return _mm_mulhrs_pi16(a, b);
 }
 
 __m64 test_mm_mullo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mullo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w
+  // CHECK: mul <4 x i16> {{%.*}}, {{%.*}}
   return _mm_mullo_pi16(a, b);
 }
 
 __m64 test_mm_or_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_or_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.por
+  // CHECK: or <1 x i64> {{%.*}}, {{%.*}}
   return _mm_or_si64(a, b);
 }
 
 __m64 test_mm_packs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packsswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
   return _mm_packs_pi16(a, b);
 }
 
 __m64 test_mm_packs_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
   return _mm_packs_pi32(a, b);
 }
 
 __m64 test_mm_packs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_packs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.packuswb
+  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
   return _mm_packs_pu16(a, b);
 }
 
 __m64 test_mm_sad_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sad_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>
   return _mm_sad_pu8(a, b);
 }
 
@@ -471,187 +480,187 @@ __m64 test_mm_set1_pi32(int a) {
 
 __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_shuffle_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(
   return _mm_shuffle_pi8(a, b);
 }
 
 __m64 test_mm_shuffle_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_shuffle_pi16
-  // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   return _mm_shuffle_pi16(a, 3);
 }
 
 __m64 test_mm_sign_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi8
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(
   return _mm_sign_pi8(a, b);
 }
 
 __m64 test_mm_sign_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi16
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(
   return _mm_sign_pi16(a, b);
 }
 
 __m64 test_mm_sign_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sign_pi32
-  // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(
   return _mm_sign_pi32(a, b);
 }
 
 __m64 test_mm_sll_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(
   return _mm_sll_pi16(a, b);
 }
 
 __m64 test_mm_sll_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(
   return _mm_sll_pi32(a, b);
 }
 
 __m64 test_mm_sll_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sll_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psll.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(
   return _mm_sll_si64(a, b);
 }
 
 __m64 test_mm_slli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(
   return _mm_slli_pi16(a, 3);
 }
 
 __m64 test_mm_slli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_slli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(
   return _mm_slli_pi32(a, 3);
 }
 
 __m64 test_mm_slli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_slli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(
   return _mm_slli_si64(a, 3);
 }
 
 __m64 test_mm_sra_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(
   return _mm_sra_pi16(a, b);
 }
 
 __m64 test_mm_sra_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sra_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psra.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(
   return _mm_sra_pi32(a, b);
 }
 
 __m64 test_mm_srai_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(
   return _mm_srai_pi16(a, 3);
 }
 
 __m64 test_mm_srai_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srai_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(
   return _mm_srai_pi32(a, 3);
 }
 
 __m64 test_mm_srl_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(
   return _mm_srl_pi16(a, b);
 }
 
 __m64 test_mm_srl_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(
   return _mm_srl_pi32(a, b);
 }
 
 __m64 test_mm_srl_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_srl_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(
   return _mm_srl_si64(a, b);
 }
 
 __m64 test_mm_srli_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(
   return _mm_srli_pi16(a, 3);
 }
 
 __m64 test_mm_srli_pi32(__m64 a) {
   // CHECK-LABEL: test_mm_srli_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(
   return _mm_srli_pi32(a, 3);
 }
 
 __m64 test_mm_srli_si64(__m64 a) {
   // CHECK-LABEL: test_mm_srli_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(
   return _mm_srli_si64(a, 3);
 }
 
 void test_mm_stream_pi(__m64 *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 void test_mm_stream_pi_void(void *p, __m64 a) {
   // CHECK-LABEL: test_mm_stream_pi_void
-  // CHECK: call void @llvm.x86.mmx.movnt.dq
+  // CHECK: store <1 x i64> {{%.*}}, ptr {{%.*}}, align 8, !nontemporal
   _mm_stream_pi(p, a);
 }
 
 __m64 test_mm_sub_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.b
+  // CHECK: sub <8 x i8> {{%.*}}, {{%.*}}
   return _mm_sub_pi8(a, b);
 }
 
 __m64 test_mm_sub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.w
+  // CHECK: sub <4 x i16> {{%.*}}, {{%.*}}
   return _mm_sub_pi16(a, b);
 }
 
 __m64 test_mm_sub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.d
+  // CHECK: sub <2 x i32> {{%.*}}, {{%.*}}
   return _mm_sub_pi32(a, b);
 }
 
 __m64 test_mm_sub_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  // CHECK: sub i64 {{%.*}}, {{%.*}}
   return _mm_sub_si64(a, b);
 }
 
 __m64 test_mm_subs_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b
+  // CHECK: call <8 x i8> @llvm.ssub.sat.v8i8(
   return _mm_subs_pi8(a, b);
 }
 
 __m64 test_mm_subs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w
+  // CHECK: call <4 x i16> @llvm.ssub.sat.v4i16(
   return _mm_subs_pi16(a, b);
 }
 
 __m64 test_mm_subs_pu8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu8
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b
+  // CHECK: call <8 x i8> @llvm.usub.sat.v8i8(
   return _mm_subs_pu8(a, b);
 }
 
 __m64 test_mm_subs_pu16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_subs_pu16
-  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w
+  // CHECK: call <4 x i16> @llvm.usub.sat.v4i16(
   return _mm_subs_pu16(a, b);
 }
 
@@ -668,42 +677,42 @@ long long test_m_to_int64(__m64 a) {
 
 __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   return _mm_unpackhi_pi8(a, b);
 }
 
 __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_pi16(a, b);
 }
 
 __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpackhi_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_pi32(a, b);
 }
 
 __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi8
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw
+  // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   return _mm_unpacklo_pi8(a, b);
 }
 
 __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi16
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd
+  // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_pi16(a, b);
 }
 
 __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_unpacklo_pi32
-  // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq
+  // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_pi32(a, b);
 }
 
 __m64 test_mm_xor_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_xor_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.pxor
+  // CHECK: xor <1 x i64> {{%.*}}, {{%.*}}
   return _mm_xor_si64(a, b);
 }
diff --git a/clang/test/CodeGen/X86/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c
index a0702c7f780d1..17fce1a48755e 100644
--- a/clang/test/CodeGen/X86/mmx-inline-asm.c
+++ b/clang/test/CodeGen/X86/mmx-inline-asm.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
 // CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
index 83be6b5517c01..741cb9c9c5ecf 100644
--- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
+++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +sse2 %s -o - | FileCheck %s
 #include <mmintrin.h>
 
 void shift(__m64 a, __m64 b, int c) {
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_slli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_slli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_slli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srli_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srli_pi32(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}})
   _mm_srli_si64(a, c);
 
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}})
   _mm_srai_pi16(a, c);
-  // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}})
+  // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}})
   _mm_srai_pi32(a, c);
 }
diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c
index 01663766d9842..39b26619475af 100644
--- a/clang/test/CodeGen/attr-target-x86-mmx.c
+++ b/clang/test/CodeGen/attr-target-x86-mmx.c
@@ -1,12 +1,11 @@
 // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// Picking a cpu that doesn't have mmx or sse by default so we can enable it later.
+// Picking a cpu that doesn't have sse by default so we can enable it later.
 
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-// Verify that when we turn on sse that we also turn on mmx.
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
+void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) {
   _mm_slli_pi16(a, c);
   _mm_slli_pi32(a, c);
   _mm_slli_si64(a, c);
@@ -19,4 +18,4 @@ void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
   _mm_srai_pi32(a, c);
 }
 
-// CHECK: "target-features"="+cx8,+mmx,+sse,+x87"
+// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87"
diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c
index de31a4db5b0c1..c42c3216ec53c 100644
--- a/clang/test/CodeGen/builtins-x86.c
+++ b/clang/test/CodeGen/builtins-x86.c
@@ -168,26 +168,6 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
   tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);
 
-  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
-  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
   tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
@@ -220,45 +200,17 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
   tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
   tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
   tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
   tmp_V8s = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
   tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
   tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
-  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
   tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
-  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
   tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
-  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
-  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
-  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
-  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
-  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
-  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
-  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
-  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
-  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
-  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
-  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);
 
   __builtin_ia32_incsspd(tmp_Ui);
   __builtin_ia32_incsspq(tmp_ULLi);
@@ -306,8 +258,6 @@ void f0(void) {
   (void) __builtin_ia32_clzero(tmp_vp);
   (void) __builtin_ia32_cldemote(tmp_vp);
 
-  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
-  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
   tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
   tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);
 
@@ -320,17 +270,12 @@ void f0(void) {
   tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
   tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
 #endif
-  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
-  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
-  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 #ifndef OPENCL
   (void) _mm_sfence();
 #endif
 
-  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
   tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
   tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
   tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
@@ -348,11 +293,8 @@ void f0(void) {
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
   tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
-  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
-  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
   tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
   tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
@@ -379,26 +321,9 @@ void f0(void) {
   (void) _mm_pause();
 #endif
 
-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, imm_i_0_8);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, imm_i_0_8);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, imm_i_0_8);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, imm_i_0_8);
 
   // Using non-immediate argument supported for gcc compatibility
-  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
-  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
-  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
-  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
-  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);
-
-  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
+
   tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
   tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
@@ -433,7 +358,6 @@ void f0(void) {
   (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
   tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
   tmp_V16c = __builtin_ia32_palignr128(tmp_V16c, tmp_V16c, imm_i);
-  tmp_V8c = __builtin_ia32_palignr(tmp_V8c, tmp_V8c, imm_i);
 #ifdef USE_SSE4
   tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
diff --git a/clang/test/CodeGen/palignr.c b/clang/test/CodeGen/palignr.c
index 5a77597c34031..092937ac115de 100644
--- a/clang/test/CodeGen/palignr.c
+++ b/clang/test/CodeGen/palignr.c
@@ -14,18 +14,3 @@ int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }
 int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
 // CHECK: xor
 int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
-
-#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n)))
-typedef __attribute__((vector_size(8))) int int2;
-
-// CHECK: palignr
-int2 align5(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 8); }
-
-// CHECK: palignr
-int2 align6(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 9); }
-
-// CHECK: palignr
-int2 align7(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 16); }
-
-// CHECK: palignr
-int2 align8(int2 a, int2 b) { return _mm_alignr_pi8(a, b, 7); }
diff --git a/clang/test/CodeGen/pr26099.c b/clang/test/CodeGen/pr26099.c
deleted file mode 100644
index 15b73b832e9d8..0000000000000
--- a/clang/test/CodeGen/pr26099.c
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=i686-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +mmx -emit-llvm -o - -Wall -Werror
-// REQUIRES: asserts
-
-#include <x86intrin.h>
-
-int __attribute__ ((__vector_size__ (8))) b;
-
-void bar(int a)
-{
-  b = __builtin_ia32_vec_init_v2si (0, a);
-}
\ No newline at end of file
diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c
index a75b3380368c0..15e4a431df65b 100644
--- a/clang/test/Headers/xmmintrin.c
+++ b/clang/test/Headers/xmmintrin.c
@@ -14,7 +14,7 @@ _MM_ALIGN16 char c;
 // checking that clang emits PACKSSDW instead of PACKSSWB.
 
 // CHECK: define{{.*}} i64 @test_mm_cvtps_pi16
-// CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128
 
 __m64 test_mm_cvtps_pi16(__m128 a) {
   return _mm_cvtps_pi16(a);
diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c
index e055cbb70e9e5..33a963c15b00d 100644
--- a/clang/test/Sema/x86-builtin-palignr.c
+++ b/clang/test/Sema/x86-builtin-palignr.c
@@ -4,5 +4,5 @@
 #include <tmmintrin.h>
 
 __m64 test1(__m64 a, __m64 b, int c) {
-   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
+   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}}
 }
diff --git a/clang/www/builtins.py b/clang/www/builtins.py
index 0c2e181a8cfa5..849e6bd4a816d 100755
--- a/clang/www/builtins.py
+++ b/clang/www/builtins.py
@@ -4,8 +4,9 @@
 
 err = 0
 
-# Giant associative set of builtin->intrinsic mappings where clang doesn't
-# implement the builtin since the vector operation works by default.
+# Giant associative set of builtin->intrinsic mappings where clang
+# doesn't implement the builtin. (Either because the vector operation
+# works without a builtin, or for other reasons.)
 
 repl_map = {
     "__builtin_ia32_addps": "_mm_add_ps",
@@ -134,6 +135,99 @@
     "__builtin_ia32_vec_ext_v2di": "_mm_extract_epi64",
     "__builtin_ia32_vec_ext_v4hi": "_mm_extract_pi16",
     "__builtin_ia32_vec_ext_v4sf": "_mm_extract_ps",
+    # Removed MMX builtins
+    "__builtin_ia32_paddb": "_mm_add_pi8",
+    "__builtin_ia32_paddw": "_mm_add_pi16",
+    "__builtin_ia32_paddd": "_mm_add_pi32",
+    "__builtin_ia32_paddsb": "_mm_adds_pi8",
+    "__builtin_ia32_paddsw": "_mm_adds_pi16",
+    "__builtin_ia32_paddusb": "_mm_adds_pu8",
+    "__builtin_ia32_paddusw": "_mm_adds_pu16",
+    "__builtin_ia32_psubb": "_mm_sub_pi8",
+    "__builtin_ia32_psubw": "_mm_sub_pi16",
+    "__builtin_ia32_psubd": "_mm_sub_pi32",
+    "__builtin_ia32_psubsb": "_mm_subs_pi8",
+    "__builtin_ia32_psubsw": "_mm_subs_pi16",
+    "__builtin_ia32_psubusb": "_mm_subs_pu8",
+    "__builtin_ia32_psubusw": "_mm_subs_pu16",
+    "__builtin_ia32_pmulhw": "_mm_mulhi_pi16",
+    "__builtin_ia32_pmullw": "_mm_mullo_pi16",
+    "__builtin_ia32_pmaddwd": "_mm_madd_pi16",
+    "__builtin_ia32_pand": "_mm_and_si64",
+    "__builtin_ia32_pandn": "_mm_andnot_si64",
+    "__builtin_ia32_por": "_mm_or_si64",
+    "__builtin_ia32_pxor": "_mm_xor_si64",
+    "__builtin_ia32_psllw": "_mm_sll_pi16",
+    "__builtin_ia32_pslld": "_mm_sll_pi32",
+    "__builtin_ia32_psllq": "_mm_sll_si64",
+    "__builtin_ia32_psrlw": "_mm_srl_pi16",
+    "__builtin_ia32_psrld": "_mm_srl_pi32",
+    "__builtin_ia32_psrlq": "_mm_srl_si64",
+    "__builtin_ia32_psraw": "_mm_sra_pi16",
+    "__builtin_ia32_psrad": "_mm_sra_pi32",
+    "__builtin_ia32_psllwi": "_mm_slli_pi16",
+    "__builtin_ia32_pslldi": "_mm_slli_pi32",
+    "__builtin_ia32_psllqi": "_mm_slli_si64",
+    "__builtin_ia32_psrlwi": "_mm_srli_pi16",
+    "__builtin_ia32_psrldi": "_mm_srli_pi32",
+    "__builtin_ia32_psrlqi": "_mm_srli_si64",
+    "__builtin_ia32_psrawi": "_mm_srai_pi16",
+    "__builtin_ia32_psradi": "_mm_srai_pi32",
+    "__builtin_ia32_packsswb": "_mm_packs_pi16",
+    "__builtin_ia32_packssdw": "_mm_packs_pi32",
+    "__builtin_ia32_packuswb": "_mm_packs_pu16",
+    "__builtin_ia32_punpckhbw": "_mm_unpackhi_pi8",
+    "__builtin_ia32_punpckhwd": "_mm_unpackhi_pi16",
+    "__builtin_ia32_punpckhdq": "_mm_unpackhi_pi32",
+    "__builtin_ia32_punpcklbw": "_mm_unpacklo_pi8",
+    "__builtin_ia32_punpcklwd": "_mm_unpacklo_pi16",
+    "__builtin_ia32_punpckldq": "_mm_unpacklo_pi32",
+    "__builtin_ia32_pcmpeqb": "_mm_cmpeq_pi8",
+    "__builtin_ia32_pcmpeqw": "_mm_cmpeq_pi16",
+    "__builtin_ia32_pcmpeqd": "_mm_cmpeq_pi32",
+    "__builtin_ia32_pcmpgtb": "_mm_cmpgt_pi8",
+    "__builtin_ia32_pcmpgtw": "_mm_cmpgt_pi16",
+    "__builtin_ia32_pcmpgtd": "_mm_cmpgt_pi32",
+    "__builtin_ia32_maskmovq": "_mm_maskmove_si64",
+    "__builtin_ia32_movntq": "_mm_stream_pi",
+    "__builtin_ia32_vec_init_v2si": "_mm_setr_pi32",
+    "__builtin_ia32_vec_init_v4hi": "_mm_setr_pi16",
+    "__builtin_ia32_vec_init_v8qi": "_mm_setr_pi8",
+    "__builtin_ia32_cvtpi2ps": "_mm_cvtpi32_ps",
+    "__builtin_ia32_cvtps2pi": "_mm_cvtps_pi32",
+    "__builtin_ia32_cvttps2pi": "_mm_cvttps_pi32",
+    "__builtin_ia32_pavgb": "_mm_avg_pu8",
+    "__builtin_ia32_pavgw": "_mm_avg_pu16",
+    "__builtin_ia32_pmaxsw": "_mm_max_pi16",
+    "__builtin_ia32_pmaxub": "_mm_max_pu8",
+    "__builtin_ia32_pminsw": "_mm_min_pi16",
+    "__builtin_ia32_pminub": "_mm_min_pu8",
+    "__builtin_ia32_pmovmskb": "_mm_movemask_pi8",
+    "__builtin_ia32_pmulhuw": "_mm_mulhi_pu16",
+    "__builtin_ia32_psadbw": "_mm_sad_pu8",
+    "__builtin_ia32_pshufw": "_mm_shuffle_pi16",
+    "__builtin_ia32_cvtpd2pi": "_mm_cvtpd_pi32",
+    "__builtin_ia32_cvtpi2pd": "_mm_cvtpi32_pd",
+    "__builtin_ia32_cvttpd2pi": "_mm_cvttpd_pi32",
+    "__builtin_ia32_paddq": "_mm_add_si64",
+    "__builtin_ia32_pmuludq": "_mm_mul_su32",
+    "__builtin_ia32_psubq": "_mm_sub_si64",
+    "__builtin_ia32_pabsb": "_mm_abs_pi8",
+    "__builtin_ia32_pabsd": "_mm_abs_pi32",
+    "__builtin_ia32_pabsw": "_mm_abs_pi16",
+    "__builtin_ia32_palignr": "_mm_alignr_pi8",
+    "__builtin_ia32_phaddd": "_mm_hadd_pi32",
+    "__builtin_ia32_phaddsw": "_mm_hadds_pi16",
+    "__builtin_ia32_phaddw": "_mm_hadd_pi16",
+    "__builtin_ia32_phsubd": "_mm_hsub_pi32",
+    "__builtin_ia32_phsubsw": "_mm_hsubs_pi16",
+    "__builtin_ia32_phsubw": "_mm_hsub_pi16",
+    "__builtin_ia32_pmaddubsw": "_mm_maddubs_pi16",
+    "__builtin_ia32_pmulhrsw": "_mm_mulhrs_pi16",
+    "__builtin_ia32_pshufb": "_mm_shuffle_pi8",
+    "__builtin_ia32_psignw": "_mm_sign_pi16",
+    "__builtin_ia32_psignb": "_mm_sign_pi8",
+    "__builtin_ia32_psignd": "_mm_sign_pi32",
 }
 
 # Special unhandled cases:
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index adc46f9789ebb..b6a92136f3828 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -222,11 +222,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse_cvttss2si64 : ClangBuiltin<"__builtin_ia32_cvttss2si64">,
       DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
 
-  def int_x86_sse_cvtps2pi : ClangBuiltin<"__builtin_ia32_cvtps2pi">,
+  def int_x86_sse_cvtps2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttps2pi: ClangBuiltin<"__builtin_ia32_cvttps2pi">,
+  def int_x86_sse_cvttps2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2ps : ClangBuiltin<"__builtin_ia32_cvtpi2ps">,
+  def int_x86_sse_cvtpi2ps :
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
 }
@@ -426,11 +426,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_cvtsd2ss : ClangBuiltin<"__builtin_ia32_cvtsd2ss">,
       DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                              llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpd2pi : ClangBuiltin<"__builtin_ia32_cvtpd2pi">,
+  def int_x86_sse_cvtpd2pi :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvttpd2pi: ClangBuiltin<"__builtin_ia32_cvttpd2pi">,
+  def int_x86_sse_cvttpd2pi:
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
-  def int_x86_sse_cvtpi2pd : ClangBuiltin<"__builtin_ia32_cvtpi2pd">,
+  def int_x86_sse_cvtpi2pd :
       DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -512,49 +512,49 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Horizontal arithmetic ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_phadd_w         : ClangBuiltin<"__builtin_ia32_phaddw">,
+  def int_x86_ssse3_phadd_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_w_128     : ClangBuiltin<"__builtin_ia32_phaddw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phadd_d         : ClangBuiltin<"__builtin_ia32_phaddd">,
+  def int_x86_ssse3_phadd_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_d_128     : ClangBuiltin<"__builtin_ia32_phaddd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phadd_sw        : ClangBuiltin<"__builtin_ia32_phaddsw">,
+  def int_x86_ssse3_phadd_sw        :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phadd_sw_128    : ClangBuiltin<"__builtin_ia32_phaddsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_w         : ClangBuiltin<"__builtin_ia32_phsubw">,
+  def int_x86_ssse3_phsub_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_w_128     : ClangBuiltin<"__builtin_ia32_phsubw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_d         : ClangBuiltin<"__builtin_ia32_phsubd">,
+  def int_x86_ssse3_phsub_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_d_128     : ClangBuiltin<"__builtin_ia32_phsubd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
                              llvm_v4i32_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_phsub_sw        : ClangBuiltin<"__builtin_ia32_phsubsw">,
+  def int_x86_ssse3_phsub_sw        :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_phsub_sw_128    : ClangBuiltin<"__builtin_ia32_phsubsw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pmadd_ub_sw     : ClangBuiltin<"__builtin_ia32_pmaddubsw">,
+  def int_x86_ssse3_pmadd_ub_sw     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pmadd_ub_sw_128 : ClangBuiltin<"__builtin_ia32_pmaddubsw128">,
@@ -564,7 +564,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Packed multiply high with round and scale
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pmul_hr_sw      : ClangBuiltin<"__builtin_ia32_pmulhrsw">,
+  def int_x86_ssse3_pmul_hr_sw      :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
   def int_x86_ssse3_pmul_hr_sw_128  : ClangBuiltin<"__builtin_ia32_pmulhrsw128">,
@@ -574,34 +574,34 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Shuffle ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pshuf_b         : ClangBuiltin<"__builtin_ia32_pshufb">,
+  def int_x86_ssse3_pshuf_b         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_pshuf_b_128     : ClangBuiltin<"__builtin_ia32_pshufb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse_pshuf_w           : ClangBuiltin<"__builtin_ia32_pshufw">,
+  def int_x86_sse_pshuf_w           :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
                              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Sign ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_psign_b         : ClangBuiltin<"__builtin_ia32_psignb">,
+  def int_x86_ssse3_psign_b         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_b_128     : ClangBuiltin<"__builtin_ia32_psignb128">,
       DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
                              llvm_v16i8_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_psign_w         : ClangBuiltin<"__builtin_ia32_psignw">,
+  def int_x86_ssse3_psign_w         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_w_128     : ClangBuiltin<"__builtin_ia32_psignw128">,
       DefaultAttrsIntrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                              llvm_v8i16_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_psign_d         : ClangBuiltin<"__builtin_ia32_psignd">,
+  def int_x86_ssse3_psign_d         :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                              llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_ssse3_psign_d_128     : ClangBuiltin<"__builtin_ia32_psignd128">,
@@ -611,13 +611,13 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Absolute value ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_ssse3_pabs_b     : ClangBuiltin<"__builtin_ia32_pabsb">,
+  def int_x86_ssse3_pabs_b     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pabs_w     : ClangBuiltin<"__builtin_ia32_pabsw">,
+  def int_x86_ssse3_pabs_w     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_ssse3_pabs_d     : ClangBuiltin<"__builtin_ia32_pabsd">,
+  def int_x86_ssse3_pabs_d     :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
@@ -2260,118 +2260,118 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   // Addition
-  def int_x86_mmx_padd_b : ClangBuiltin<"__builtin_ia32_paddb">,
+  def int_x86_mmx_padd_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_w : ClangBuiltin<"__builtin_ia32_paddw">,
+  def int_x86_mmx_padd_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_d : ClangBuiltin<"__builtin_ia32_paddd">,
+  def int_x86_mmx_padd_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padd_q : ClangBuiltin<"__builtin_ia32_paddq">,
+  def int_x86_mmx_padd_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_padds_b : ClangBuiltin<"__builtin_ia32_paddsb">,
+  def int_x86_mmx_padds_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_padds_w : ClangBuiltin<"__builtin_ia32_paddsw">,
+  def int_x86_mmx_padds_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_paddus_b : ClangBuiltin<"__builtin_ia32_paddusb">,
+  def int_x86_mmx_paddus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_paddus_w : ClangBuiltin<"__builtin_ia32_paddusw">,
+  def int_x86_mmx_paddus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Subtraction
-  def int_x86_mmx_psub_b : ClangBuiltin<"__builtin_ia32_psubb">,
+  def int_x86_mmx_psub_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_w : ClangBuiltin<"__builtin_ia32_psubw">,
+  def int_x86_mmx_psub_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_d : ClangBuiltin<"__builtin_ia32_psubd">,
+  def int_x86_mmx_psub_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psub_q : ClangBuiltin<"__builtin_ia32_psubq">,
+  def int_x86_mmx_psub_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psubs_b : ClangBuiltin<"__builtin_ia32_psubsb">,
+  def int_x86_mmx_psubs_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubs_w : ClangBuiltin<"__builtin_ia32_psubsw">,
+  def int_x86_mmx_psubs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psubus_b : ClangBuiltin<"__builtin_ia32_psubusb">,
+  def int_x86_mmx_psubus_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psubus_w : ClangBuiltin<"__builtin_ia32_psubusw">,
+  def int_x86_mmx_psubus_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
   // Multiplication
-  def int_x86_mmx_pmulh_w : ClangBuiltin<"__builtin_ia32_pmulhw">,
+  def int_x86_mmx_pmulh_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmull_w : ClangBuiltin<"__builtin_ia32_pmullw">,
+  def int_x86_mmx_pmull_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulhu_w : ClangBuiltin<"__builtin_ia32_pmulhuw">,
+  def int_x86_mmx_pmulhu_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmulu_dq : ClangBuiltin<"__builtin_ia32_pmuludq">,
+  def int_x86_mmx_pmulu_dq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmadd_wd : ClangBuiltin<"__builtin_ia32_pmaddwd">,
+  def int_x86_mmx_pmadd_wd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Bitwise operations
-  def int_x86_mmx_pand : ClangBuiltin<"__builtin_ia32_pand">,
+  def int_x86_mmx_pand :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pandn : ClangBuiltin<"__builtin_ia32_pandn">,
+  def int_x86_mmx_pandn :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_por : ClangBuiltin<"__builtin_ia32_por">,
+  def int_x86_mmx_por :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pxor : ClangBuiltin<"__builtin_ia32_pxor">,
+  def int_x86_mmx_pxor :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Averages
-  def int_x86_mmx_pavg_b : ClangBuiltin<"__builtin_ia32_pavgb">,
+  def int_x86_mmx_pavg_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pavg_w : ClangBuiltin<"__builtin_ia32_pavgw">,
+  def int_x86_mmx_pavg_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Maximum
-  def int_x86_mmx_pmaxu_b : ClangBuiltin<"__builtin_ia32_pmaxub">,
+  def int_x86_mmx_pmaxu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmaxs_w : ClangBuiltin<"__builtin_ia32_pmaxsw">,
+  def int_x86_mmx_pmaxs_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Minimum
-  def int_x86_mmx_pminu_b : ClangBuiltin<"__builtin_ia32_pminub">,
+  def int_x86_mmx_pminu_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pmins_w : ClangBuiltin<"__builtin_ia32_pminsw">,
+  def int_x86_mmx_pmins_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
   // Packed sum of absolute differences
-  def int_x86_mmx_psad_bw : ClangBuiltin<"__builtin_ia32_psadbw">,
+  def int_x86_mmx_psad_bw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 }
@@ -2379,58 +2379,58 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // Integer shift ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   // Shift left logical
-  def int_x86_mmx_psll_w : ClangBuiltin<"__builtin_ia32_psllw">,
+  def int_x86_mmx_psll_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_d : ClangBuiltin<"__builtin_ia32_pslld">,
+  def int_x86_mmx_psll_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psll_q : ClangBuiltin<"__builtin_ia32_psllq">,
+  def int_x86_mmx_psll_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrl_w : ClangBuiltin<"__builtin_ia32_psrlw">,
+  def int_x86_mmx_psrl_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_d : ClangBuiltin<"__builtin_ia32_psrld">,
+  def int_x86_mmx_psrl_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrl_q : ClangBuiltin<"__builtin_ia32_psrlq">,
+  def int_x86_mmx_psrl_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psra_w : ClangBuiltin<"__builtin_ia32_psraw">,
+  def int_x86_mmx_psra_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psra_d : ClangBuiltin<"__builtin_ia32_psrad">,
+  def int_x86_mmx_psra_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 
   // Oddly these don't require an immediate due to a gcc compatibility issue.
-  def int_x86_mmx_pslli_w : ClangBuiltin<"__builtin_ia32_psllwi">,
+  def int_x86_mmx_pslli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_d : ClangBuiltin<"__builtin_ia32_pslldi">,
+  def int_x86_mmx_pslli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pslli_q : ClangBuiltin<"__builtin_ia32_psllqi">,
+  def int_x86_mmx_pslli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrli_w : ClangBuiltin<"__builtin_ia32_psrlwi">,
+  def int_x86_mmx_psrli_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_d : ClangBuiltin<"__builtin_ia32_psrldi">,
+  def int_x86_mmx_psrli_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrli_q : ClangBuiltin<"__builtin_ia32_psrlqi">,
+  def int_x86_mmx_psrli_q :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 
-  def int_x86_mmx_psrai_w : ClangBuiltin<"__builtin_ia32_psrawi">,
+  def int_x86_mmx_psrai_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_psrai_d : ClangBuiltin<"__builtin_ia32_psradi">,
+  def int_x86_mmx_psrai_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 }
@@ -2475,83 +2475,83 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 }
 // Pack ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_packsswb : ClangBuiltin<"__builtin_ia32_packsswb">,
+  def int_x86_mmx_packsswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packssdw : ClangBuiltin<"__builtin_ia32_packssdw">,
+  def int_x86_mmx_packssdw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_packuswb : ClangBuiltin<"__builtin_ia32_packuswb">,
+  def int_x86_mmx_packuswb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Unpacking ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_punpckhbw : ClangBuiltin<"__builtin_ia32_punpckhbw">,
+  def int_x86_mmx_punpckhbw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckhwd : ClangBuiltin<"__builtin_ia32_punpckhwd">,
+  def int_x86_mmx_punpckhwd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckhdq : ClangBuiltin<"__builtin_ia32_punpckhdq">,
+  def int_x86_mmx_punpckhdq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpcklbw : ClangBuiltin<"__builtin_ia32_punpcklbw">,
+  def int_x86_mmx_punpcklbw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpcklwd : ClangBuiltin<"__builtin_ia32_punpcklwd">,
+  def int_x86_mmx_punpcklwd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_punpckldq : ClangBuiltin<"__builtin_ia32_punpckldq">,
+  def int_x86_mmx_punpckldq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Integer comparison ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_pcmpeq_b : ClangBuiltin<"__builtin_ia32_pcmpeqb">,
+  def int_x86_mmx_pcmpeq_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_w : ClangBuiltin<"__builtin_ia32_pcmpeqw">,
+  def int_x86_mmx_pcmpeq_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
-  def int_x86_mmx_pcmpeq_d : ClangBuiltin<"__builtin_ia32_pcmpeqd">,
+  def int_x86_mmx_pcmpeq_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem, Commutative]>;
 
-  def int_x86_mmx_pcmpgt_b : ClangBuiltin<"__builtin_ia32_pcmpgtb">,
+  def int_x86_mmx_pcmpgt_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_w : ClangBuiltin<"__builtin_ia32_pcmpgtw">,
+  def int_x86_mmx_pcmpgt_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_mmx_pcmpgt_d : ClangBuiltin<"__builtin_ia32_pcmpgtd">,
+  def int_x86_mmx_pcmpgt_d :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
 
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_mmx_maskmovq : ClangBuiltin<"__builtin_ia32_maskmovq">,
+  def int_x86_mmx_maskmovq :
               Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;
 
-  def int_x86_mmx_pmovmskb : ClangBuiltin<"__builtin_ia32_pmovmskb">,
+  def int_x86_mmx_pmovmskb :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 
-  def int_x86_mmx_movnt_dq : ClangBuiltin<"__builtin_ia32_movntq">,
+  def int_x86_mmx_movnt_dq :
               Intrinsic<[], [llvm_ptr_ty, llvm_x86mmx_ty], []>;
 
-  def int_x86_mmx_palignr_b : ClangBuiltin<"__builtin_ia32_palignr">,
+  def int_x86_mmx_palignr_b :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
-  def int_x86_mmx_pextr_w : ClangBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+  def int_x86_mmx_pextr_w :
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
-  def int_x86_mmx_pinsr_w : ClangBuiltin<"__builtin_ia32_vec_set_v4hi">,
+  def int_x86_mmx_pinsr_w :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty],
                             [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
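
As context for the annotation being dropped throughout this patch: ClangBuiltin<"__builtin_ia32_...">
ties an intrinsic to a clang builtin so that clang lowers calls to that builtin directly to the
intrinsic. With the annotation removed, clang no longer maps these builtins to the llvm.x86.mmx.*
intrinsics automatically. A minimal sketch of the kind of user code involved (the helper name is
illustrative, not taken from the patch):

#include <mmintrin.h>

// _mm_add_pi8 historically expanded to __builtin_ia32_paddb, which the
// ClangBuiltin<"__builtin_ia32_paddb"> annotation mapped straight to
// llvm.x86.mmx.padd.b; without the annotation the builtin is no longer
// tied to that intrinsic.
__m64 add_bytes(__m64 a, __m64 b) {
  return _mm_add_pi8(a, b);
}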

>From c6e69b041a7e6d18463f6cf684b10fd46a62c496 Mon Sep 17 00:00:00 2001
From: Carlos Seo <carlos.seo at linaro.org>
Date: Wed, 24 Jul 2024 18:14:05 -0300
Subject: [PATCH 071/259] [Flang][Docs] Update information about AArch64
 trampolines (#100391)

Commits c4b66bf and 7647174 add support for AArch64 trampolines. Update the
documentation to reflect the changes.
---
 flang/docs/InternalProcedureTrampolines.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/docs/InternalProcedureTrampolines.md b/flang/docs/InternalProcedureTrampolines.md
index ef02f1d737c82..41f6155332a47 100644
--- a/flang/docs/InternalProcedureTrampolines.md
+++ b/flang/docs/InternalProcedureTrampolines.md
@@ -239,7 +239,7 @@ automatically deallocated at the end of `host()` invocation.
 Unfortunately, this requires the program stack to be writeable and executable
 at the same time, which might be a security concern.
 
-> NOTE: LLVM's AArch64 backend supports `nest` attribute, but it does not seem to support trampoline intrinsics.
+> NOTE: LLVM's AArch64 backend supports `nest` attribute, but it requires the compiler-rt runtime selected via the `-rtlib=compiler-rt` flag.
 
 ## Alternative implementation(s)
 

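As a rough analogue of the mechanism described in the patched document (a sketch only; none of
these names come from the patch): a Fortran internal procedure that uses host variables behaves
like a capturing lambda, but callers expect a plain procedure pointer with no hidden closure
argument. The on-stack trampoline is the small thunk that bridges that gap.

void callee(void (*fn)());   // expects a plain function pointer, no closure

void host() {
  int counter = 0;
  auto internal = [&counter] { ++counter; };  // captures a host local
  (void)internal;
  // callee(internal) would be ill-formed: a capturing lambda does not convert
  // to a plain function pointer. For a Fortran internal procedure the
  // trampoline fills exactly this gap by embedding the host frame address and
  // jumping to the real entry point; that is why the stack must be writable
  // and executable, and why the AArch64 lowering needs the compiler-rt
  // runtime mentioned in the NOTE above.
}
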
>From 70c6e79e6d3e897418f3556a25e22e66ff018dc4 Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev at accesssoftek.com>
Date: Thu, 25 Jul 2024 00:24:50 +0300
Subject: [PATCH 072/259] [PAC][clang][test] Implement missing tests for some
 PAuth features (#100206)

Implement tests for the following PAuth-related features:

- driver, preprocessor, and ELF codegen tests for type_info vtable
pointer discrimination #99726;

- driver, preprocessor, and ELF codegen (emitting function attributes) +
sema (emitting errors) tests for indirect gotos signing #97647;

- ELF codegen tests for ubsan type checks + auth #99590;

- ELF codegen tests for constant global init with polymorphic MI #99741;

- ELF codegen tests for C++ member function pointers auth #99576.
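
A minimal sketch of the C++ construct those member-function-pointer tests exercise (the names
below are illustrative, not taken from the new test files): forming and calling through a
pointer to member function, which under -fptrauth-calls is either stored as a signed function
address (non-virtual targets) or dispatched through an authenticated vtable load (virtual
targets).

struct Base {
  void nonvirtual() {}
  virtual void virt() {}
};

using Method = void (Base::*)();

void call_through(Base *b, Method fn) {
  (b->*fn)();  // indirect call through the member function pointer
}

void use(Base *b) {
  call_through(b, &Base::nonvirtual);  // stored as a signed function address
  call_through(b, &Base::virt);        // dispatched via an authenticated
                                       // vtable load
}
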
---
 clang/lib/Driver/ToolChains/Clang.cpp         |  3 +
 .../CodeGen/ptrauth-function-attributes.c     |  5 +-
 clang/test/CodeGen/ubsan-function.cpp         |  7 +-
 .../ptrauth-global-constant-initializers.cpp  | 77 +++++++++++--------
 .../ptrauth-member-function-pointer.cpp       | 55 +++++++------
 .../CodeGenCXX/ptrauth-type-info-vtable.cpp   | 17 +++-
 clang/test/Driver/aarch64-ptrauth.c           |  9 ++-
 clang/test/Preprocessor/ptrauth_feature.c     | 36 +++++++--
 clang/test/Sema/ptrauth-indirect-goto.c       |  1 +
 9 files changed, 137 insertions(+), 73 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index df1bb8e9ee308..752a71739eeb5 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1839,6 +1839,9 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   Args.addOptInFlag(
       CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination,
       options::OPT_fno_ptrauth_vtable_pointer_type_discrimination);
+  Args.addOptInFlag(
+      CmdArgs, options::OPT_fptrauth_type_info_vtable_pointer_discrimination,
+      options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination);
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini,
                     options::OPT_fno_ptrauth_init_fini);
   Args.addOptInFlag(
diff --git a/clang/test/CodeGen/ptrauth-function-attributes.c b/clang/test/CodeGen/ptrauth-function-attributes.c
index 7f93ccc7c4bce..6a09cd37bf485 100644
--- a/clang/test/CodeGen/ptrauth-function-attributes.c
+++ b/clang/test/CodeGen/ptrauth-function-attributes.c
@@ -4,8 +4,9 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios  -fptrauth-calls   -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls  -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 
-// RUN: %clang_cc1 -triple arm64-apple-ios  -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
-// RUN: %clang_cc1 -triple arm64e-apple-ios -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+// RUN: %clang_cc1 -triple arm64e-apple-ios  -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
 
 // ALL: define {{(dso_local )?}}void @test() #0
 void test() {
diff --git a/clang/test/CodeGen/ubsan-function.cpp b/clang/test/CodeGen/ubsan-function.cpp
index 8478f05a10b78..76d4237383f83 100644
--- a/clang/test/CodeGen/ubsan-function.cpp
+++ b/clang/test/CodeGen/ubsan-function.cpp
@@ -4,7 +4,8 @@
 // RUN: %clang_cc1 -triple aarch64_be-linux-gnu -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all | FileCheck %s --check-prefixes=CHECK,GNU,64
 // RUN: %clang_cc1 -triple arm-none-eabi -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all | FileCheck %s --check-prefixes=CHECK,ARM,GNU,32
 
-// RUN: %clang_cc1 -triple arm64e-apple-ios -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all -fptrauth-calls | FileCheck %s --check-prefixes=CHECK,GNU,64,64e
+// RUN: %clang_cc1 -triple arm64e-apple-ios  -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all -fptrauth-calls | FileCheck %s --check-prefixes=CHECK,GNU,64,AUTH
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s -fsanitize=function -fno-sanitize-recover=all -fptrauth-calls | FileCheck %s --check-prefixes=CHECK,GNU,64,AUTH
 
 // GNU:  define{{.*}} void @_Z3funv() #0 !func_sanitize ![[FUNCSAN:.*]] {
 // MSVC: define{{.*}} void @"?fun@@YAXXZ"() #0 !func_sanitize ![[FUNCSAN:.*]] {
@@ -15,8 +16,8 @@ void fun() {}
 // ARM:   ptrtoint ptr {{.*}} to i32, !nosanitize !5
 // ARM:   and i32 {{.*}}, -2, !nosanitize !5
 // ARM:   inttoptr i32 {{.*}} to ptr, !nosanitize !5
-// 64e:   %[[STRIPPED:.*]] = ptrtoint ptr {{.*}} to i64, !nosanitize
-// 64e:   call i64 @llvm.ptrauth.auth(i64 %[[STRIPPED]], i32 0, i64 0), !nosanitize
+// AUTH:  %[[STRIPPED:.*]] = ptrtoint ptr {{.*}} to i64, !nosanitize
+// AUTH:  call i64 @llvm.ptrauth.auth(i64 %[[STRIPPED]], i32 0, i64 0), !nosanitize
 // CHECK: getelementptr <{ i32, i32 }>, ptr {{.*}}, i32 -1, i32 0, !nosanitize
 // CHECK: load i32, ptr {{.*}}, align {{.*}}, !nosanitize
 // CHECK: icmp eq i32 {{.*}}, -1056584962, !nosanitize
diff --git a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
index f0c3ea83d8958..9ce9def6156ef 100644
--- a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,DARWIN
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,ELF
 
 // CHECK: %struct.Base1 = type { ptr }
 // CHECK: %struct.Base2 = type { ptr }
@@ -6,27 +9,27 @@
 // CHECK: %struct.Derived2 = type { %struct.Base2, %struct.Base1 }
 // CHECK: %struct.Derived3 = type { %struct.Base1, %struct.Base2 }
 
-// CHECK: @_ZTV5Base1 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC:38871]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2))] }, align 8
-// CHECK: @g_b1 = global %struct.Base1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC:6511]], ptr @g_b1) }, align 8
-// CHECK: @_ZTV5Base2 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC:27651]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2))] }, align 8
-// CHECK: @g_b2 = global %struct.Base2 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC:63631]], ptr @g_b2) }, align 8
-// CHECK: @_ZTV8Derived1 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived11cEv, i32 0, i64 [[DERIVED1_C_DISC:54092]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived11dEv, i32 0, i64 [[DERIVED1_D_DISC:37391]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d1 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d1), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d1, i32 0, i32 1)) }, align 8
-// CHECK: @_ZTV8Derived2 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived21cEv, i32 0, i64 [[DERIVED2_C_DISC:15537]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived21eEv, i32 0, i64 209, ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d2 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr @g_d2), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d2, i32 0, i32 1)) }, align 8
-// CHECK: @_ZTV8Derived3 = linkonce_odr unnamed_addr constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived31iEv, i32 0, i64 [[DERIVED3_I_DISC:19084]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 3))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d3 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d3), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d3, i32 0, i32 1)) }, align 8
-// CHECK: @g_vb1 = global %struct.VirtualBase1 zeroinitializer, align 8
-// CHECK: @g_vb2 = global %struct.VirtualBase2 zeroinitializer, align 8
-// CHECK: @g_d4 = global %struct.Derived4 zeroinitializer, align 8
-// CHECK: @_ZTV12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC:7987]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8
-// CHECK: @_ZTT12VirtualBase1 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2)], align 8
-// CHECK: @_ZTV12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC:51224]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8
-// CHECK: @_ZTT12VirtualBase2 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3), i32 2)], align 8
-// CHECK: @_ZTV8Derived4 = linkonce_odr unnamed_addr constant { [7 x ptr], [5 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 5)), ptr ptrauth (ptr @_ZN8Derived41hEv, i32 0, i64 [[DERIVED4_H_DISC:31844]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 6))], [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 4))] }, align 8
-// CHECK: @_ZTT8Derived4 = linkonce_odr unnamed_addr constant [7 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3), i32 2)], align 8
-// CHECK: @_ZTC8Derived40_12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8
-// CHECK: @_ZTC8Derived48_12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8
+// CHECK: @_ZTV5Base1 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC:38871]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2))] },{{.*}} align 8
+// CHECK: @g_b1 = global %struct.Base1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC:6511]], ptr @g_b1) },{{.*}} align 8
+// CHECK: @_ZTV5Base2 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC:27651]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2))] },{{.*}} align 8
+// CHECK: @g_b2 = global %struct.Base2 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC:63631]], ptr @g_b2) },{{.*}} align 8
+// CHECK: @_ZTV8Derived1 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived11cEv, i32 0, i64 [[DERIVED1_C_DISC:54092]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived11dEv, i32 0, i64 [[DERIVED1_D_DISC:37391]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d1 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d1), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d1, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @_ZTV8Derived2 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived21cEv, i32 0, i64 [[DERIVED2_C_DISC:15537]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived21eEv, i32 0, i64 209, ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d2 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr @g_d2), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d2, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @_ZTV8Derived3 = linkonce_odr unnamed_addr constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived31iEv, i32 0, i64 [[DERIVED3_I_DISC:19084]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 3))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2))] },{{.*}} align 8
+// CHECK: @g_d3 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d3), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d3, i32 0, i32 1)) },{{.*}} align 8
+// CHECK: @g_vb1 = global %struct.VirtualBase1 zeroinitializer,{{.*}} align 8
+// CHECK: @g_vb2 = global %struct.VirtualBase2 zeroinitializer,{{.*}} align 8
+// CHECK: @g_d4 = global %struct.Derived4 zeroinitializer,{{.*}} align 8
+// CHECK: @_ZTV12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC:7987]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 5))] },{{.*}} align 8
+// CHECK: @_ZTT12VirtualBase1 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2)],{{.*}} align 8
+// CHECK: @_ZTV12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC:51224]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3))] },{{.*}} align 8
+// CHECK: @_ZTT12VirtualBase2 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3), i32 2)],{{.*}} align 8
+// CHECK: @_ZTV8Derived4 = linkonce_odr unnamed_addr constant { [7 x ptr], [5 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 5)), ptr ptrauth (ptr @_ZN8Derived41hEv, i32 0, i64 [[DERIVED4_H_DISC:31844]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 6))], [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 4))] },{{.*}} align 8
+// CHECK: @_ZTT8Derived4 = linkonce_odr unnamed_addr constant [7 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3), i32 2)],{{.*}} align 8
+// CHECK: @_ZTC8Derived40_12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 5))] },{{.*}} align 8
+// CHECK: @_ZTC8Derived48_12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3))] },{{.*}} align 8
 
 struct Base1 { virtual void a() {} };
 struct Base2 { virtual void b() {} };
@@ -73,20 +76,24 @@ struct Derived5 : VirtualBase2, VirtualBase1 {
   virtual void h() {}
 };
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase1C1Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN12VirtualBase1C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN12VirtualBase1C1Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase2C1Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN12VirtualBase2C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN12VirtualBase2C1Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived4C1Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived4C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived4C1Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived5C1Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived5C1Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived5C1Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
@@ -155,7 +162,7 @@ extern "C" void cross_check_vtables(Base1 *b1,
   d5->h();
 }
 
-// CHECK-LABEL: define void @cross_check_vtables(
+// CHECK-LABEL: define{{.*}} void @cross_check_vtables(
 // CHECK: "; b1->a()",
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
@@ -214,21 +221,25 @@ extern "C" void cross_check_vtables(Base1 *b1,
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED4_H_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN5Base1C2Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN5Base1C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN5Base1C2Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN5Base2C2Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN5Base2C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN5Base2C2Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived1C2Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived1C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived1C2Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived2C2Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived2C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived2C2Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived3C2Ev
+// DARWIN-LABEL: define {{.*}} ptr @_ZN8Derived3C2Ev
+// ELF-LABEL:    define {{.*}} void @_ZN8Derived3C2Ev
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
 // CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-
diff --git a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
index 5e84e3e7bc5e9..0a9ac3fa510f5 100644
--- a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
@@ -1,8 +1,14 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG %s
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK %s
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,DARWIN %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,DARWIN %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,ELF %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,ELF %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
 
 
 // CHECK: @gmethod0 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1:35591]]) to i64), i64 0 }, align 8
@@ -78,9 +84,9 @@ struct Class0 {
   MethodTy1 m0;
 };
 
-// CHECK: define void @_ZN5Base08virtual1Ev(
+// CHECK: define{{.*}} void @_ZN5Base08virtual1Ev(
 
-// CHECK: define void @_Z5test0v()
+// CHECK: define{{.*}} void @_Z5test0v()
 // CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
 // CHECK-NEXT: %[[VARMETHOD1:.*]] = alloca { i64, i64 }, align 8
 // CHECK-NEXT: %[[METHOD2:.*]] = alloca { i64, i64 }, align 8
@@ -246,7 +252,7 @@ void test0() {
   method7 = &Derived1::virtual1;
 }
 
-// CHECK: define void @_Z5test1P5Base0MS_FvvE(ptr noundef %[[A0:.*]], [2 x i64] %[[A1_COERCE:.*]])
+// CHECK: define{{.*}} void @_Z5test1P5Base0MS_FvvE(ptr noundef %[[A0:.*]], [2 x i64] %[[A1_COERCE:.*]])
 // CHECK: %[[A1:.*]] = alloca { i64, i64 }, align 8
 // CHECK: %[[A0_ADDR:.*]] = alloca ptr, align 8
 // CHECK: %[[A1_ADDR:.*]] = alloca { i64, i64 }, align 8
@@ -264,15 +270,16 @@ void test0() {
 // CHECK: %[[MEMPTR_ISVIRTUAL:.*]] = icmp ne i64 %[[V5]], 0
 // CHECK: br i1 %[[MEMPTR_ISVIRTUAL]]
 
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[V4]], align 8
-// CHECK: %[[V7:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V8:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V7]], i32 2, i64 0)
-// CHECK: %[[V9:.*]] = inttoptr i64 %[[V8]] to ptr
-// CHECK: %[[V10:.*]] = trunc i64 %[[MEMPTR_PTR]] to i32
-// CHECK: %[[V11:.*]] = zext i32 %[[V10]] to i64
-// CHECK: %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[V11]]
-// CHECK: %[[MEMPTR_VIRTUALFN:.*]] = load ptr, ptr %[[V12]], align 8
-// CHECK: br
+// CHECK:  %[[VTABLE:.*]] = load ptr, ptr %[[V4]], align 8
+// CHECK:  %[[V7:.*]] = ptrtoint ptr %[[VTABLE]] to i64
+// CHECK:  %[[V8:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V7]], i32 2, i64 0)
+// CHECK:  %[[V9:.*]] = inttoptr i64 %[[V8]] to ptr
+// DARWIN: %[[V10:.*]] = trunc i64 %[[MEMPTR_PTR]] to i32
+// DARWIN: %[[V11:.*]] = zext i32 %[[V10]] to i64
+// DARWIN: %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[V11]]
+// ELF:    %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[MEMPTR_PTR]]
+// CHECK:  %[[MEMPTR_VIRTUALFN:.*]] = load ptr, ptr %[[V12]], align 8
+// CHECK:  br
 
 // CHECK: %[[MEMPTR_NONVIRTUALFN:.*]] = inttoptr i64 %[[MEMPTR_PTR]] to ptr
 // CHECK: br
@@ -286,7 +293,7 @@ void test1(Base0 *a0, MethodTy0 a1) {
   (a0->*a1)();
 }
 
-// CHECK: define void @_Z15testConversion0M5Base0FvvEM8Derived0FvvE([2 x i64] %[[METHOD0_COERCE:.*]], [2 x i64] %[[METHOD1_COERCE:.*]])
+// CHECK: define{{.*}} void @_Z15testConversion0M5Base0FvvEM8Derived0FvvE([2 x i64] %[[METHOD0_COERCE:.*]], [2 x i64] %[[METHOD1_COERCE:.*]])
 // CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
 // CHECK: %[[METHOD1:.*]] = alloca { i64, i64 }, align 8
 // CHECK: %[[METHOD0_ADDR:.*]] = alloca { i64, i64 }, align 8
@@ -326,21 +333,21 @@ void testConversion0(MethodTy0 method0, MethodTy1 method1) {
   method1 = method0;
 }
 
-// CHECK: define void @_Z15testConversion1M5Base0FvvE(
+// CHECK: define{{.*}} void @_Z15testConversion1M5Base0FvvE(
 // CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC0]], i32 0, i64 [[TYPEDISC1]])
 
 void testConversion1(MethodTy0 method0) {
   MethodTy1 method1 = reinterpret_cast<MethodTy1>(method0);
 }
 
-// CHECK: define void @_Z15testConversion2M8Derived0FvvE(
+// CHECK: define{{.*}} void @_Z15testConversion2M8Derived0FvvE(
 // CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
 
 void testConversion2(MethodTy1 method1) {
   MethodTy0 method0 = static_cast<MethodTy0>(method1);
 }
 
-// CHECK: define void @_Z15testConversion3M8Derived0FvvE(
+// CHECK: define{{.*}} void @_Z15testConversion3M8Derived0FvvE(
 // CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
 
 void testConversion3(MethodTy1 method1) {
@@ -350,7 +357,7 @@ void testConversion3(MethodTy1 method1) {
 // No need to call @llvm.ptrauth.resign if the source member function
 // pointer is a constant.
 
-// CHECK: define void @_Z15testConversion4v(
+// CHECK: define{{.*}} void @_Z15testConversion4v(
 // CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
 // CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
 // CHECK: ret void
@@ -396,7 +403,7 @@ MethodTy1 gmethod0 = reinterpret_cast<MethodTy1>(&Base0::nonvirtual0);
 MethodTy0 gmethod1 = reinterpret_cast<MethodTy0>(&Derived0::nonvirtual5);
 MethodTy0 gmethod2 = reinterpret_cast<MethodTy0>(&Derived0::virtual1);
 
-// CHECK-LABEL: define void @_Z13testArrayInitv()
+// CHECK-LABEL: define{{.*}} void @_Z13testArrayInitv()
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p0, ptr align 8 @__const._Z13testArrayInitv.p0, i64 16, i1 false)
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p1, ptr align 8 @__const._Z13testArrayInitv.p1, i64 16, i1 false)
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %c0, ptr align 8 @__const._Z13testArrayInitv.c0, i64 16, i1 false)
@@ -424,7 +431,7 @@ void testArrayInit() {
 // STACK-PROT-NOT: sspreq
 // STACK-PROT-NEXT: attributes
 
-// CHECK: define void @_Z15testConvertNullv(
+// CHECK: define{{.*}} void @_Z15testConvertNullv(
 // CHECK: %[[T:.*]] = alloca { i64, i64 },
 // store { i64, i64 } zeroinitializer, { i64, i64 }* %[[T]],
 
diff --git a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
index d5f69e0485140..174aeda89d175 100644
--- a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
@@ -4,6 +4,12 @@
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NODISC
 
+// RUN: %clang_cc1 -DENABLE_TID=0 -I%S -std=c++11 -triple=aarch64-linux-gnu \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NODISC
+
 // RUN: %clang_cc1 -DENABLE_TID=1 -I%S -std=c++11 -triple=arm64e-apple-darwin \
 // RUN:   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
@@ -11,6 +17,13 @@
 // RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
 // RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,DISC
 
+// RUN: %clang_cc1 -DENABLE_TID=1 -I%S -std=c++11 -triple=aarch64-linux-gnu \
+// RUN:   -fptrauth-calls -fptrauth-intrinsics \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
+// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,DISC
+
 // copied from typeinfo
 namespace std {
 
@@ -64,7 +77,7 @@ TestStruct::~TestStruct(){}
 extern "C" void test_vtable(std::type_info* t) {
   t->test_method();
 }
-// NODISC: define void @test_vtable(ptr noundef %t)
+// NODISC: define{{.*}} void @test_vtable(ptr noundef %t)
 // NODISC: [[T_ADDR:%.*]] = alloca ptr, align 8
 // NODISC: store ptr %t, ptr [[T_ADDR]], align 8
 // NODISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
@@ -72,7 +85,7 @@ extern "C" void test_vtable(std::type_info* t) {
 // NODISC: [[CAST_VPTR:%.*]] = ptrtoint ptr [[VPTR]] to i64
 // NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VPTR]], i32 2, i64 0)
 
-// DISC: define void @test_vtable(ptr noundef %t)
+// DISC: define{{.*}} void @test_vtable(ptr noundef %t)
 // DISC: [[T_ADDR:%.*]] = alloca ptr, align 8
 // DISC: store ptr %t, ptr [[T_ADDR]], align 8
 // DISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index d13930e8f4b37..20899b8920cf9 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -11,9 +11,11 @@
 // RUN:   -fno-ptrauth-auth-traps -fptrauth-auth-traps \
 // RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-type-info-vtable-pointer-discrimination -fptrauth-type-info-vtable-pointer-discrimination \
 // RUN:   -fno-ptrauth-init-fini -fptrauth-init-fini \
+// RUN:   -fno-ptrauth-indirect-gotos -fptrauth-indirect-gotos \
 // RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL
-// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
+// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-init-fini" "-fptrauth-indirect-gotos"
 
 // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
 // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
@@ -34,13 +36,16 @@
 
 // RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=ERR1
+// RUN:   -fptrauth-type-info-vtable-pointer-discrimination -fptrauth-indirect-gotos -fptrauth-init-fini %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR1
 // ERR1:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-type-info-vtable-pointer-discrimination' for target '{{.*}}'
+// ERR1-NEXT: error: unsupported option '-fptrauth-indirect-gotos' for target '{{.*}}'
 // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
 
 //// Only support PAuth ABI for Linux as for now.
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index 1330ad10b4b47..14059f827b94c 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -2,25 +2,31 @@
 //// For example, -fptrauth-init-fini will not affect codegen without -fptrauth-calls, but the preprocessor feature would be set anyway.
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-intrinsics | \
-// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-returns | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,RETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-address-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-vtable-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOFUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-type-info-vtable-pointer-discrimination | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,TYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-function-pointer-type-discrimination | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,FUNC,NOINITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,FUNC,NOINITFINI,NOGOTOS
 
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-init-fini | \
-// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOFUNC,INITFINI
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,INITFINI,NOGOTOS
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-indirect-gotos | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,GOTOS
 
 #if __has_feature(ptrauth_intrinsics)
 // INTRIN: has_ptrauth_intrinsics
@@ -71,6 +77,14 @@ void has_ptrauth_vtable_pointer_type_discrimination() {}
 void no_ptrauth_vtable_pointer_type_discrimination() {}
 #endif
 
+#if __has_feature(ptrauth_type_info_vtable_pointer_discrimination)
+// TYPE_INFO_DISCR: has_ptrauth_type_info_vtable_pointer_discrimination
+void has_ptrauth_type_info_vtable_pointer_discrimination() {}
+#else
+// NOTYPE_INFO_DISCR: no_ptrauth_type_info_vtable_pointer_discrimination
+void no_ptrauth_type_info_vtable_pointer_discrimination() {}
+#endif
+
 #if __has_feature(ptrauth_function_pointer_type_discrimination)
 // FUNC: has_ptrauth_function_pointer_type_discrimination
 void has_ptrauth_function_pointer_type_discrimination() {}
@@ -86,3 +100,11 @@ void has_ptrauth_init_fini() {}
 // NOINITFINI: no_ptrauth_init_fini
 void no_ptrauth_init_fini() {}
 #endif
+
+#if __has_feature(ptrauth_indirect_gotos)
+// GOTOS: has_ptrauth_indirect_gotos
+void has_ptrauth_indirect_gotos() {}
+#else
+// NOGOTOS: no_ptrauth_indirect_gotos
+void no_ptrauth_indirect_gotos() {}
+#endif
diff --git a/clang/test/Sema/ptrauth-indirect-goto.c b/clang/test/Sema/ptrauth-indirect-goto.c
index 47bc76738d23b..7304f5c30a117 100644
--- a/clang/test/Sema/ptrauth-indirect-goto.c
+++ b/clang/test/Sema/ptrauth-indirect-goto.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple arm64e-apple-darwin -fsyntax-only -verify %s -fptrauth-indirect-gotos
+// RUN: %clang_cc1 -triple aarch64-linux-gnu   -fsyntax-only -verify %s -fptrauth-indirect-gotos
 
 int f() {
   static void *addrs[] = { &&l1, &&l2 };

>From 74e14605d73e6286651d4b47ba5e04858cec7ebd Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie at gmail.com>
Date: Wed, 24 Jul 2024 22:00:59 +0000
Subject: [PATCH 073/259] Fix test to write to %T instead of the current
 working directory (which may not be writeable)

---
 lld/test/MachO/objc-category-merging-erase-objc-name-test.s | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lld/test/MachO/objc-category-merging-erase-objc-name-test.s b/lld/test/MachO/objc-category-merging-erase-objc-name-test.s
index 01c5c4fd9e0c3..aeb2395b3a858 100644
--- a/lld/test/MachO/objc-category-merging-erase-objc-name-test.s
+++ b/lld/test/MachO/objc-category-merging-erase-objc-name-test.s
@@ -4,9 +4,9 @@
 ; then when merging the category into the base class (and deleting the category), we don't
 ; delete the 'MyTestProtocol' name
 
-; RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o erase-objc-name.o %s
-; RUN: %lld -arch arm64 -dylib -o erase-objc-name.dylib erase-objc-name.o -objc_category_merging
-; RUN: llvm-objdump --objc-meta-data --macho erase-objc-name.dylib | FileCheck %s --check-prefixes=MERGE_CATS
+; RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o %T/erase-objc-name.o %s
+; RUN: %lld -arch arm64 -dylib -o %T/erase-objc-name.dylib %T/erase-objc-name.o -objc_category_merging
+; RUN: llvm-objdump --objc-meta-data --macho %T/erase-objc-name.dylib | FileCheck %s --check-prefixes=MERGE_CATS
 
 ; === Check merge categories enabled ===
 ; Check that the original categories are not there

>From 8b094c9df34f3190b66d11d19379be1f4c89beec Mon Sep 17 00:00:00 2001
From: aaryanshukla <53713108+aaryanshukla at users.noreply.github.com>
Date: Wed, 24 Jul 2024 15:04:05 -0700
Subject: [PATCH 074/259] [libc][newheadergen]: PyYaml Version Update (#100463)

- Many builds had issues using new headergen because they do not
  have PyYAML installed.
---
 libc/docs/dev/header_generation.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 89036f1277b96..32ce91da83f3d 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -22,7 +22,8 @@ Instructions
 ------------
 
 Required Versions:
-  - Python Version: 3.11.8
+  - Python Version: 3.6
+  - PyYAML Version: 5.1
 
 1. Keep full-build mode on when building, otherwise headers will not be
    generated.

>From e846fb48038a34d8df3ad7412bbdcf37e9e7acc9 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Wed, 24 Jul 2024 15:07:38 -0700
Subject: [PATCH 075/259]  [lldb] Prevent passing a nullptr to std::string in
 ObjectFileMachO (#100421)

Prevent passing a nullptr to std::string::insert in
ObjectFileMachO::GetDependentModules. Calling GetCString on an empty
ConstString returns a nullptr, and passing that pointer to
std::string::insert is undefined behavior. Instead, use the GetString
helper, which returns an empty string in that case.

rdar://132388027
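
For readers unfamiliar with the failure mode, here is a minimal stand-alone
sketch of the hazard and the fix. It uses a hypothetical FakeConstString
stand-in rather than the real lldb_private::ConstString API, so names and
signatures are illustrative only; the point is the nullptr-vs-empty-string
contract described above.

  #include <iostream>
  #include <string>

  // Hypothetical stand-in for a ConstString-like type: the C-string accessor
  // may return nullptr when the value is empty, while the std::string
  // accessor always returns a valid (possibly empty) string.
  struct FakeConstString {
    const char *data = nullptr;                                 // empty value holds no buffer
    const char *GetCString() const { return data; }             // may be nullptr
    std::string GetString() const { return data ? data : ""; }  // never null
  };

  int main() {
    FakeConstString empty_dir;              // e.g. a file spec with no directory component
    std::string rpath = "/relative/part";

    // Unsafe pattern: std::string::insert(pos, const char*) requires a
    // non-null pointer, so this is undefined behavior when the accessor
    // returns nullptr.
    // rpath.insert(0, empty_dir.GetCString());

    // Safe pattern analogous to the fix: insert an empty string instead.
    rpath.insert(0, empty_dir.GetString());
    std::cout << rpath << "\n";             // prints "/relative/part"
    return 0;
  }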
---
 .../ObjectFile/Mach-O/ObjectFileMachO.cpp     | 207 +++++++++---------
 1 file changed, 104 insertions(+), 103 deletions(-)

diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 2c7005449f9d7..ce095bcc48374 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -137,6 +137,9 @@ using namespace lldb;
 using namespace lldb_private;
 using namespace llvm::MachO;
 
+static constexpr llvm::StringLiteral g_loader_path = "@loader_path";
+static constexpr llvm::StringLiteral g_executable_path = "@executable_path";
+
 LLDB_PLUGIN_DEFINE(ObjectFileMachO)
 
 static void PrintRegisterValue(RegisterContext *reg_ctx, const char *name,
@@ -5116,123 +5119,121 @@ UUID ObjectFileMachO::GetUUID() {
 }
 
 uint32_t ObjectFileMachO::GetDependentModules(FileSpecList &files) {
+  ModuleSP module_sp = GetModule();
+  if (!module_sp)
+    return 0;
+
   uint32_t count = 0;
-  ModuleSP module_sp(GetModule());
-  if (module_sp) {
-    std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex());
-    llvm::MachO::load_command load_cmd;
-    lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic);
-    std::vector<std::string> rpath_paths;
-    std::vector<std::string> rpath_relative_paths;
-    std::vector<std::string> at_exec_relative_paths;
-    uint32_t i;
-    for (i = 0; i < m_header.ncmds; ++i) {
-      const uint32_t cmd_offset = offset;
-      if (m_data.GetU32(&offset, &load_cmd, 2) == nullptr)
-        break;
+  std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex());
+  llvm::MachO::load_command load_cmd;
+  lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic);
+  std::vector<std::string> rpath_paths;
+  std::vector<std::string> rpath_relative_paths;
+  std::vector<std::string> at_exec_relative_paths;
+  uint32_t i;
+  for (i = 0; i < m_header.ncmds; ++i) {
+    const uint32_t cmd_offset = offset;
+    if (m_data.GetU32(&offset, &load_cmd, 2) == nullptr)
+      break;
 
-      switch (load_cmd.cmd) {
-      case LC_RPATH:
-      case LC_LOAD_DYLIB:
-      case LC_LOAD_WEAK_DYLIB:
-      case LC_REEXPORT_DYLIB:
-      case LC_LOAD_DYLINKER:
-      case LC_LOADFVMLIB:
-      case LC_LOAD_UPWARD_DYLIB: {
-        uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
-        // For LC_LOAD_DYLIB there is an alternate encoding
-        // which adds a uint32_t `flags` field for `DYLD_USE_*`
-        // flags.  This can be detected by a timestamp field with
-        // the `DYLIB_USE_MARKER` constant value.
-        bool is_delayed_init = false;
-        uint32_t use_command_marker = m_data.GetU32(&offset);
-        if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
-          offset += 4; /* uint32_t current_version */
-          offset += 4; /* uint32_t compat_version */
-          uint32_t flags = m_data.GetU32(&offset);
-          // If this LC_LOAD_DYLIB is marked delay-init,
-          // don't report it as a dependent library -- it
-          // may be loaded in the process at some point,
-          // but will most likely not be load at launch.
-          if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
-            is_delayed_init = true;
-        }
-        const char *path = m_data.PeekCStr(name_offset);
-        if (path && !is_delayed_init) {
-          if (load_cmd.cmd == LC_RPATH)
-            rpath_paths.push_back(path);
-          else {
-            if (path[0] == '@') {
-              if (strncmp(path, "@rpath", strlen("@rpath")) == 0)
-                rpath_relative_paths.push_back(path + strlen("@rpath"));
-              else if (strncmp(path, "@executable_path",
-                               strlen("@executable_path")) == 0)
-                at_exec_relative_paths.push_back(path +
-                                                 strlen("@executable_path"));
-            } else {
-              FileSpec file_spec(path);
-              if (files.AppendIfUnique(file_spec))
-                count++;
-            }
+    switch (load_cmd.cmd) {
+    case LC_RPATH:
+    case LC_LOAD_DYLIB:
+    case LC_LOAD_WEAK_DYLIB:
+    case LC_REEXPORT_DYLIB:
+    case LC_LOAD_DYLINKER:
+    case LC_LOADFVMLIB:
+    case LC_LOAD_UPWARD_DYLIB: {
+      uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
+      // For LC_LOAD_DYLIB there is an alternate encoding
+      // which adds a uint32_t `flags` field for `DYLD_USE_*`
+      // flags.  This can be detected by a timestamp field with
+      // the `DYLIB_USE_MARKER` constant value.
+      bool is_delayed_init = false;
+      uint32_t use_command_marker = m_data.GetU32(&offset);
+      if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
+        offset += 4; /* uint32_t current_version */
+        offset += 4; /* uint32_t compat_version */
+        uint32_t flags = m_data.GetU32(&offset);
+        // If this LC_LOAD_DYLIB is marked delay-init,
+        // don't report it as a dependent library -- it
+        // may be loaded in the process at some point,
+        // but will most likely not be load at launch.
+        if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
+          is_delayed_init = true;
+      }
+      const char *path = m_data.PeekCStr(name_offset);
+      if (path && !is_delayed_init) {
+        if (load_cmd.cmd == LC_RPATH)
+          rpath_paths.push_back(path);
+        else {
+          if (path[0] == '@') {
+            if (strncmp(path, "@rpath", strlen("@rpath")) == 0)
+              rpath_relative_paths.push_back(path + strlen("@rpath"));
+            else if (strncmp(path, "@executable_path",
+                             strlen("@executable_path")) == 0)
+              at_exec_relative_paths.push_back(path +
+                                               strlen("@executable_path"));
+          } else {
+            FileSpec file_spec(path);
+            if (files.AppendIfUnique(file_spec))
+              count++;
           }
         }
-      } break;
-
-      default:
-        break;
       }
-      offset = cmd_offset + load_cmd.cmdsize;
-    }
+    } break;
 
-    FileSpec this_file_spec(m_file);
-    FileSystem::Instance().Resolve(this_file_spec);
-
-    if (!rpath_paths.empty()) {
-      // Fixup all LC_RPATH values to be absolute paths
-      std::string loader_path("@loader_path");
-      std::string executable_path("@executable_path");
-      for (auto &rpath : rpath_paths) {
-        if (llvm::StringRef(rpath).starts_with(loader_path)) {
-          rpath.erase(0, loader_path.size());
-          rpath.insert(0, this_file_spec.GetDirectory().GetCString());
-        } else if (llvm::StringRef(rpath).starts_with(executable_path)) {
-          rpath.erase(0, executable_path.size());
-          rpath.insert(0, this_file_spec.GetDirectory().GetCString());
-        }
-      }
+    default:
+      break;
+    }
+    offset = cmd_offset + load_cmd.cmdsize;
+  }
 
-      for (const auto &rpath_relative_path : rpath_relative_paths) {
-        for (const auto &rpath : rpath_paths) {
-          std::string path = rpath;
-          path += rpath_relative_path;
-          // It is OK to resolve this path because we must find a file on disk
-          // for us to accept it anyway if it is rpath relative.
-          FileSpec file_spec(path);
-          FileSystem::Instance().Resolve(file_spec);
-          if (FileSystem::Instance().Exists(file_spec) &&
-              files.AppendIfUnique(file_spec)) {
-            count++;
-            break;
-          }
-        }
-      }
+  FileSpec this_file_spec(m_file);
+  FileSystem::Instance().Resolve(this_file_spec);
+
+  if (!rpath_paths.empty()) {
+    // Fixup all LC_RPATH values to be absolute paths.
+    const std::string this_directory =
+        this_file_spec.GetDirectory().GetString();
+    for (auto &rpath : rpath_paths) {
+      if (llvm::StringRef(rpath).starts_with(g_loader_path))
+        rpath = this_directory + rpath.substr(g_loader_path.size());
+      else if (llvm::StringRef(rpath).starts_with(g_executable_path))
+        rpath = this_directory + rpath.substr(g_executable_path.size());
     }
 
-    // We may have @executable_paths but no RPATHS.  Figure those out here.
-    // Only do this if this object file is the executable.  We have no way to
-    // get back to the actual executable otherwise, so we won't get the right
-    // path.
-    if (!at_exec_relative_paths.empty() && CalculateType() == eTypeExecutable) {
-      FileSpec exec_dir = this_file_spec.CopyByRemovingLastPathComponent();
-      for (const auto &at_exec_relative_path : at_exec_relative_paths) {
-        FileSpec file_spec =
-            exec_dir.CopyByAppendingPathComponent(at_exec_relative_path);
+    for (const auto &rpath_relative_path : rpath_relative_paths) {
+      for (const auto &rpath : rpath_paths) {
+        std::string path = rpath;
+        path += rpath_relative_path;
+        // It is OK to resolve this path because we must find a file on disk
+        // for us to accept it anyway if it is rpath relative.
+        FileSpec file_spec(path);
+        FileSystem::Instance().Resolve(file_spec);
         if (FileSystem::Instance().Exists(file_spec) &&
-            files.AppendIfUnique(file_spec))
+            files.AppendIfUnique(file_spec)) {
           count++;
+          break;
+        }
       }
     }
   }
+
+  // We may have @executable_paths but no RPATHS.  Figure those out here.
+  // Only do this if this object file is the executable.  We have no way to
+  // get back to the actual executable otherwise, so we won't get the right
+  // path.
+  if (!at_exec_relative_paths.empty() && CalculateType() == eTypeExecutable) {
+    FileSpec exec_dir = this_file_spec.CopyByRemovingLastPathComponent();
+    for (const auto &at_exec_relative_path : at_exec_relative_paths) {
+      FileSpec file_spec =
+          exec_dir.CopyByAppendingPathComponent(at_exec_relative_path);
+      if (FileSystem::Instance().Exists(file_spec) &&
+          files.AppendIfUnique(file_spec))
+        count++;
+    }
+  }
   return count;
 }
 

>From 58fb51492d9669525662fa269295d85537968569 Mon Sep 17 00:00:00 2001
From: Leandro Lupori <leandro.lupori at linaro.org>
Date: Wed, 24 Jul 2024 19:14:59 -0300
Subject: [PATCH 076/259] Revert "[flang][OpenMP] Fix copyprivate semantic
 checks" (#100478)

Reverts llvm/llvm-project#95799

This caused errors in some internal test suites.
---
 flang/include/flang/Semantics/tools.h         |   1 -
 flang/lib/Semantics/resolve-directives.cpp    | 138 ++++++------------
 flang/test/Semantics/OpenMP/copyprivate04.f90 | 112 --------------
 .../Semantics/OpenMP/do05-positivecase.f90    |   4 +-
 flang/test/Semantics/OpenMP/do20.f90          |   2 +-
 flang/test/Semantics/OpenMP/implicit-dsa.f90  |  14 +-
 flang/test/Semantics/OpenMP/reduction08.f90   |  10 +-
 flang/test/Semantics/OpenMP/reduction09.f90   |   8 +-
 flang/test/Semantics/OpenMP/symbol01.f90      |   4 +-
 flang/test/Semantics/OpenMP/symbol02.f90      |   4 +-
 flang/test/Semantics/OpenMP/symbol03.f90      |   4 +-
 flang/test/Semantics/OpenMP/symbol05.f90      |   4 +-
 flang/test/Semantics/OpenMP/symbol07.f90      |   2 +-
 flang/test/Semantics/OpenMP/symbol08.f90      |  36 ++---
 flang/test/Semantics/OpenMP/symbol09.f90      |   2 +-
 15 files changed, 95 insertions(+), 250 deletions(-)
 delete mode 100644 flang/test/Semantics/OpenMP/copyprivate04.f90

diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index f21ef28618142..0fcba3131fad1 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -87,7 +87,6 @@ bool IsIntrinsicConcat(
 bool IsGenericDefinedOp(const Symbol &);
 bool IsDefinedOperator(SourceName);
 std::string MakeOpName(SourceName);
-bool IsCommonBlockContaining(const Symbol &, const Symbol &);
 
 // Returns true if maybeAncestor exists and is a proper ancestor of a
 // descendent scope (or symbol owner).  Will be false, unlike Scope::Contains(),
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index e9811151972fe..fb32ce6837fbf 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -19,7 +19,6 @@
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
 #include "flang/Semantics/expression.h"
-#include "flang/Semantics/tools.h"
 #include <list>
 #include <map>
 #include <sstream>
@@ -730,6 +729,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   void CheckNameInAllocateStmt(const parser::CharBlock &source,
       const parser::Name &ompObject, const parser::AllocateStmt &allocate);
 
+  bool HasSymbolInEnclosingScope(const Symbol &, Scope &);
   std::int64_t ordCollapseLevel{0};
 
   void AddOmpRequiresToScope(Scope &, WithOmpDeclarative::RequiresFlags,
@@ -2085,22 +2085,16 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
         }
       }
 
-      // When handling each implicit rule for a given symbol, one of the
-      // following 3 actions may be taken:
-      // 1. Declare a new private symbol.
-      // 2. Create a new association symbol with no flags, that will represent
-      //    a shared symbol in the current scope. Note that symbols without
-      //    any private flags are considered as shared.
-      // 3. Use the last declared private symbol, by inserting a new symbol
-      //    in the scope being processed, associated with it.
-      //    If no private symbol was declared previously, then no association
-      //    is needed and the symbol from the enclosing scope will be
-      //    inherited by the current one.
-      //
-      // Because of how symbols are collected in lowering, not inserting a new
-      // symbol in the last case could lead to the conclusion that a symbol
-      // from an enclosing construct was declared in the current construct,
-      // which would result in wrong privatization code being generated.
+      // When handling each implicit rule, either a new private symbol is
+      // declared or the last declared symbol is used.
+      // In the latter case, it's necessary to insert a new symbol in the scope
+      // being processed, associated with the last declared symbol.
+      // This captures the fact that, although we are using the last declared
+      // symbol, its DSA could be different in this scope.
+      // Also, because of how symbols are collected in lowering, not inserting
+      // a new symbol in this scope could lead to the conclusion that the
+      // symbol was declared in this construct, which would result in wrong
+      // privatization code being generated.
       // Consider the following example:
       //
       // !$omp parallel default(private)              ! p1
@@ -2113,56 +2107,48 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
       // (p2), it would use the x symbol definition from the enclosing scope.
       // Then, when p2's default symbols were collected in lowering, the x
       // symbol from the outer parallel construct (p1) would be collected, as
-      // it would have the private flag set.
+      // it would have the private flag set (note that symbols that don't have
+      // any private flag are considered as shared).
       // This would make x appear to be defined in p2, causing it to be
       // privatized in p2 and its privatization in p1 to be skipped.
-      auto makePrivateSymbol = [&](Symbol::Flag flag) {
+      auto declNewSymbol = [&](Symbol::Flag flag) {
         Symbol *hostSymbol =
             lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
         lastDeclSymbol = DeclarePrivateAccessEntity(
             *hostSymbol, flag, context_.FindScope(dirContext.directiveSource));
         return lastDeclSymbol;
       };
-      auto makeSharedSymbol = [&]() {
-        Symbol *hostSymbol =
-            lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
-        MakeAssocSymbol(symbol->name(), *hostSymbol,
-            context_.FindScope(dirContext.directiveSource));
-      };
       auto useLastDeclSymbol = [&]() {
         if (lastDeclSymbol)
           MakeAssocSymbol(symbol->name(), *lastDeclSymbol,
               context_.FindScope(dirContext.directiveSource));
       };
 
-      bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
-      bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
-      bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
-      bool teamsDir = llvm::omp::allTeamsSet.test(dirContext.directive);
-
       if (dsa.has_value()) {
-        if (dsa.value() == Symbol::Flag::OmpShared &&
-            (parallelDir || taskGenDir || teamsDir))
-          makeSharedSymbol();
-        // Private symbols will have been declared already.
+        useLastDeclSymbol();
         prevDSA = dsa;
         continue;
       }
 
+      bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
+      bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
+      bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
+
       if (dirContext.defaultDSA == Symbol::Flag::OmpPrivate ||
           dirContext.defaultDSA == Symbol::Flag::OmpFirstPrivate ||
           dirContext.defaultDSA == Symbol::Flag::OmpShared) {
         // 1) default
         // Allowed only with parallel, teams and task generating constructs.
-        assert(parallelDir || taskGenDir || teamsDir);
+        assert(parallelDir || taskGenDir ||
+            llvm::omp::allTeamsSet.test(dirContext.directive));
         if (dirContext.defaultDSA != Symbol::Flag::OmpShared)
-          makePrivateSymbol(dirContext.defaultDSA);
+          declNewSymbol(dirContext.defaultDSA);
         else
-          makeSharedSymbol();
+          useLastDeclSymbol();
         dsa = dirContext.defaultDSA;
       } else if (parallelDir) {
         // 2) parallel -> shared
-        makeSharedSymbol();
+        useLastDeclSymbol();
         dsa = Symbol::Flag::OmpShared;
       } else if (!taskGenDir && !targetDir) {
         // 3) enclosing context
@@ -2175,12 +2161,12 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
         // TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
         if (prevDSA == Symbol::Flag::OmpShared) {
           // 6) shared in enclosing context -> shared
-          makeSharedSymbol();
+          useLastDeclSymbol();
           dsa = Symbol::Flag::OmpShared;
         } else {
           // 7) firstprivate
           dsa = Symbol::Flag::OmpFirstPrivate;
-          makePrivateSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
+          declNewSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
         }
       }
       prevDSA = dsa;
@@ -2584,59 +2570,20 @@ void ResolveOmpTopLevelParts(
   });
 }
 
-static bool IsSymbolInCommonBlock(const Symbol &symbol) {
-  // TODO Improve the performance of this predicate function.
-  //      Going through all symbols sequentially, in all common blocks, can be
-  //      slow when there are many symbols. A possible optimization is to add
-  //      an OmpInCommonBlock flag to Symbol, to make it possible to quickly
-  //      test if a given symbol is in a common block.
-  for (const auto &cb : symbol.owner().commonBlocks()) {
-    if (IsCommonBlockContaining(cb.second.get(), symbol)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static bool IsSymbolThreadprivate(const Symbol &symbol) {
+void OmpAttributeVisitor::CheckDataCopyingClause(
+    const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
+  const auto *checkSymbol{&symbol};
   if (const auto *details{symbol.detailsIf<HostAssocDetails>()}) {
-    return details->symbol().test(Symbol::Flag::OmpThreadprivate);
-  }
-  return symbol.test(Symbol::Flag::OmpThreadprivate);
-}
-
-static bool IsSymbolPrivate(const Symbol &symbol) {
-  if (symbol.test(Symbol::Flag::OmpPrivate) ||
-      symbol.test(Symbol::Flag::OmpFirstPrivate)) {
-    return true;
-  }
-  // A symbol that has not gone through constructs that may privatize the
-  // original symbol may be predetermined as private.
-  // (OMP 5.2 5.1.1 - Variables Referenced in a Construct)
-  if (symbol == symbol.GetUltimate()) {
-    switch (symbol.owner().kind()) {
-    case Scope::Kind::MainProgram:
-    case Scope::Kind::Subprogram:
-    case Scope::Kind::BlockConstruct:
-      return !symbol.attrs().test(Attr::SAVE) &&
-          !symbol.attrs().test(Attr::PARAMETER) && !IsAssumedShape(symbol) &&
-          !IsSymbolInCommonBlock(symbol);
-    default:
-      return false;
-    }
+    checkSymbol = &details->symbol();
   }
-  return false;
-}
 
-void OmpAttributeVisitor::CheckDataCopyingClause(
-    const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
   if (ompFlag == Symbol::Flag::OmpCopyIn) {
     // List of items/objects that can appear in a 'copyin' clause must be
     // 'threadprivate'
-    if (!IsSymbolThreadprivate(symbol)) {
+    if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate)) {
       context_.Say(name.source,
           "Non-THREADPRIVATE object '%s' in COPYIN clause"_err_en_US,
-          symbol.name());
+          checkSymbol->name());
     }
   } else if (ompFlag == Symbol::Flag::OmpCopyPrivate &&
       GetContext().directive == llvm::omp::Directive::OMPD_single) {
@@ -2649,13 +2596,18 @@ void OmpAttributeVisitor::CheckDataCopyingClause(
           "COPYPRIVATE variable '%s' may not appear on a PRIVATE or "
           "FIRSTPRIVATE clause on a SINGLE construct"_err_en_US,
           symbol.name());
-    } else if (!IsSymbolThreadprivate(symbol) && !IsSymbolPrivate(symbol)) {
+    } else {
       // List of items/objects that can appear in a 'copyprivate' clause must be
       // either 'private' or 'threadprivate' in enclosing context.
-      context_.Say(name.source,
-          "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
-          "outer context"_err_en_US,
-          symbol.name());
+      if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate) &&
+          !(HasSymbolInEnclosingScope(symbol, currScope()) &&
+              (symbol.test(Symbol::Flag::OmpPrivate) ||
+                  symbol.test(Symbol::Flag::OmpFirstPrivate)))) {
+        context_.Say(name.source,
+            "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
+            "outer context"_err_en_US,
+            symbol.name());
+      }
     }
   }
 }
@@ -2725,6 +2677,12 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
   }
 }
 
+bool OmpAttributeVisitor::HasSymbolInEnclosingScope(
+    const Symbol &symbol, Scope &scope) {
+  const auto symbols{scope.parent().GetSymbols()};
+  return llvm::is_contained(symbols, symbol);
+}
+
 // Goes through the names in an OmpObjectList and checks if each name appears
 // in the given allocate statement
 void OmpAttributeVisitor::CheckAllNamesInAllocateStmt(
diff --git a/flang/test/Semantics/OpenMP/copyprivate04.f90 b/flang/test/Semantics/OpenMP/copyprivate04.f90
deleted file mode 100644
index 291cf1103fb27..0000000000000
--- a/flang/test/Semantics/OpenMP/copyprivate04.f90
+++ /dev/null
@@ -1,112 +0,0 @@
-! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
-! OpenMP Version 5.2
-! 5.1.1 - Variables Referenced in a Construct
-! Copyprivate must accept variables that are predetermined as private.
-
-module m1
-  integer :: m
-end module
-
-program omp_copyprivate
-  use m1
-  implicit none
-  integer :: i
-  integer, save :: j
-  integer :: k
-  common /c/ k
-  real, parameter :: pi = 3.14
-  integer :: a1(10)
-
-  ! Local variables are private.
-  !$omp single
-    i = 123
-  !$omp end single copyprivate(i)
-  !$omp single
-  !$omp end single copyprivate(a1)
-
-  ! Variables with the SAVE attribute are not private.
-  !$omp single
-  !ERROR: COPYPRIVATE variable 'j' is not PRIVATE or THREADPRIVATE in outer context
-  !$omp end single copyprivate(j)
-
-  ! Common block variables are not private.
-  !$omp single
-  !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
-  !$omp end single copyprivate(/c/)
-  !$omp single
-  !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
-  !$omp end single copyprivate(k)
-
-  ! Module variables are not private.
-  !$omp single
-  !ERROR: COPYPRIVATE variable 'm' is not PRIVATE or THREADPRIVATE in outer context
-  !$omp end single copyprivate(m)
-
-  ! Parallel can make a variable shared.
-  !$omp parallel
-    !$omp single
-      i = 456
-    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
-    !$omp end single copyprivate(i)
-    call sub(j, a1)
-  !$omp end parallel
-
-  !$omp parallel shared(i)
-    !$omp single
-      i = 456
-    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
-    !$omp end single copyprivate(i)
-  !$omp end parallel
-
-  !FIXME: an error should be emitted in this case.
-  !       copyprivate(i) should be considered as a reference to i and a new
-  !       symbol should be created in `parallel` scope, for this case to be
-  !       handled properly.
-  !$omp parallel
-    !$omp single
-    !$omp end single copyprivate(i)
-  !$omp end parallel
-
-  ! Named constants are shared.
-  !$omp single
-  !ERROR: COPYPRIVATE variable 'pi' is not PRIVATE or THREADPRIVATE in outer context
-  !$omp end single copyprivate(pi)
-
-  !$omp parallel do
-  do i = 1, 10
-    !$omp parallel
-    !$omp single
-      j = i
-    !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
-    !$omp end single copyprivate(i)
-    !$omp end parallel
-  end do
-  !$omp end parallel do
-
-contains
-  subroutine sub(s1, a)
-    integer :: s1
-    integer :: a(:)
-
-    ! Dummy argument.
-    !$omp single
-    !$omp end single copyprivate(s1)
-
-    ! Assumed shape arrays are shared.
-    !$omp single
-    !ERROR: COPYPRIVATE variable 'a' is not PRIVATE or THREADPRIVATE in outer context
-    !$omp end single copyprivate(a)
-  end subroutine
-
-  integer function fun(f1)
-    integer :: f1
-
-    ! Dummy argument.
-    !$omp single
-    !$omp end single copyprivate(f1)
-
-    ! Function result is private.
-    !$omp single
-    !$omp end single copyprivate(fun)
-  end function
-end program
diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90
index 3b512a5b4f25e..4e02235f58a1a 100644
--- a/flang/test/Semantics/OpenMP/do05-positivecase.f90
+++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90
@@ -20,12 +20,12 @@ program omp_do
   !$omp parallel  default(shared)
   !$omp do
   !DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
-  !DEF: /omp_do/OtherConstruct2/n HostAssoc INTEGER(4)
+  !REF: /omp_do/n
   do i=1,n
     !$omp parallel
     !$omp single
     !DEF: /work EXTERNAL (Subroutine) ProcEntity
-    !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
+    !REF: /omp_do/OtherConstruct2/OtherConstruct1/i
     call work(i, 1)
     !$omp end single
     !$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/do20.f90 b/flang/test/Semantics/OpenMP/do20.f90
index 0cafae76b86b0..915d01e69edd7 100644
--- a/flang/test/Semantics/OpenMP/do20.f90
+++ b/flang/test/Semantics/OpenMP/do20.f90
@@ -10,7 +10,7 @@ subroutine shared_iv
 
   !$omp parallel shared(i)
     !$omp single
-      !DEF: /shared_iv/OtherConstruct1/i HostAssoc INTEGER(4)
+      !REF: /shared_iv/i
       do i = 0, 1
       end do
     !$omp end single
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 2abe3a0e16d62..92d2421d06f97 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -15,14 +15,14 @@ subroutine implicit_dsa_test1
   !$omp task private(y) shared(z)
     !DEF: /implicit_dsa_test1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
     !DEF: /implicit_dsa_test1/OtherConstruct1/y (OmpPrivate) HostAssoc INTEGER(4)
-    !DEF: /implicit_dsa_test1/OtherConstruct1/z HostAssoc INTEGER(4)
+    !REF: /implicit_dsa_test1/z
     x = y + z
   !$omp end task
 
   !$omp task default(shared)
-    !DEF: /implicit_dsa_test1/OtherConstruct2/x HostAssoc INTEGER(4)
-    !DEF: /implicit_dsa_test1/OtherConstruct2/y HostAssoc INTEGER(4)
-    !DEF: /implicit_dsa_test1/OtherConstruct2/z HostAssoc INTEGER(4)
+    !REF: /implicit_dsa_test1/x
+    !REF: /implicit_dsa_test1/y
+    !REF: /implicit_dsa_test1/z
     x = y + z
   !$omp end task
 
@@ -61,16 +61,16 @@ subroutine implicit_dsa_test3
 
   !$omp parallel
     !$omp task
-      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
+      !REF: /implicit_dsa_test3/x
       x = 1
-      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4)
+      !REF: /implicit_dsa_test3/y
       y = 1
     !$omp end task
 
     !$omp task firstprivate(x)
       !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate) HostAssoc INTEGER(4)
       x = 1
-      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/z HostAssoc INTEGER(4)
+      !REF: /implicit_dsa_test3/z
       z = 1
     !$omp end task
   !$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90
index 9442fbd4d5978..99163327cdafa 100644
--- a/flang/test/Semantics/OpenMP/reduction08.f90
+++ b/flang/test/Semantics/OpenMP/reduction08.f90
@@ -15,7 +15,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct1/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !DEF: /omp_reduction/OtherConstruct1/m HostAssoc INTEGER(4)
+    !REF: /omp_reduction/m
     k = max(k, m)
   end do
   !$omp end parallel do
@@ -25,7 +25,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct2/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !DEF: /omp_reduction/OtherConstruct2/m HostAssoc INTEGER(4)
+    !REF: /omp_reduction/m
     k = min(k, m)
   end do
   !$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct3/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !DEF: /omp_reduction/OtherConstruct3/m HostAssoc INTEGER(4)
+    !REF: /omp_reduction/m
     k = iand(k, m)
   end do
   !$omp end parallel do
@@ -45,7 +45,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct4/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !DEF: /omp_reduction/OtherConstruct4/m HostAssoc INTEGER(4)
+    !REF: /omp_reduction/m
     k = ior(k, m)
   end do
   !$omp end parallel do
@@ -55,7 +55,7 @@ program omp_reduction
   do i=1,10
     !DEF: /omp_reduction/OtherConstruct5/k (OmpReduction) HostAssoc INTEGER(4)
     !DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-    !DEF: /omp_reduction/OtherConstruct5/m HostAssoc INTEGER(4)
+    !REF: /omp_reduction/m
     k = ieor(k,m)
   end do
   !$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90
index 1af2fc4fd9691..dbc8d1b060e65 100644
--- a/flang/test/Semantics/OpenMP/reduction09.f90
+++ b/flang/test/Semantics/OpenMP/reduction09.f90
@@ -26,7 +26,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:a(10))
   !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !DEF: /omp_reduction/OtherConstruct2/k HostAssoc INTEGER(4)
+    !REF: /omp_reduction/k
     k = k+1
   end do
   !$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:a(1:10:1))
   !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !DEF: /omp_reduction/OtherConstruct3/k HostAssoc INTEGER(4)
+    !REF: /omp_reduction/k
     k = k+1
   end do
   !$omp end parallel do
@@ -43,7 +43,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:b(1:10:1,1:5,2))
   !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !DEF: /omp_reduction/OtherConstruct4/k HostAssoc INTEGER(4)
+    !REF: /omp_reduction/k
     k = k+1
   end do
   !$omp end parallel do
@@ -51,7 +51,7 @@ program omp_reduction
   !$omp parallel do  reduction(+:b(1:10:1,1:5,2:5:1))
   !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do i=1,10
-    !DEF: /omp_reduction/OtherConstruct5/k HostAssoc INTEGER(4)
+    !REF: /omp_reduction/k
     k = k+1
   end do
   !$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index ecfb8622f8179..0b435a9ab9850 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -48,7 +48,7 @@ program mm
  !DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
  do i=1,10
   !DEF: /mm/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
-  !DEF: /mm/OtherConstruct1/b HostAssoc INTEGER(4)
+  !REF: /mm/b
   !REF: /mm/OtherConstruct1/i
   a = a+b(i)
   !DEF: /mm/OtherConstruct1/t (OmpPrivate) HostAssoc TYPE(myty)
@@ -62,7 +62,7 @@ program mm
   !REF: /mm/OtherConstruct1/i
   !REF: /mm/OtherConstruct1/y
   x = a+i+y
-  !DEF: /mm/OtherConstruct1/c HostAssoc REAL(4)
+  !REF: /mm/c
   c = 3.0
  end do
 end program
diff --git a/flang/test/Semantics/OpenMP/symbol02.f90 b/flang/test/Semantics/OpenMP/symbol02.f90
index c199c526e1fa8..f6ffc5500d0a4 100644
--- a/flang/test/Semantics/OpenMP/symbol02.f90
+++ b/flang/test/Semantics/OpenMP/symbol02.f90
@@ -15,9 +15,9 @@
   a = 3.
   !DEF: /MainProgram1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
   b = 4
-  !DEF: /MainProgram1/OtherConstruct1/c HostAssoc REAL(4)
+  !REF: /MainProgram1/c
   c = 5
-  !DEF: /MainProgram1/OtherConstruct1/d HostAssoc REAL(4)
+  !DEF: /MainProgram1/d (Implicit) ObjectEntity REAL(4)
   d = 6
   !$omp end parallel
   !DEF: /MainProgram1/a (Implicit) ObjectEntity REAL(4)
diff --git a/flang/test/Semantics/OpenMP/symbol03.f90 b/flang/test/Semantics/OpenMP/symbol03.f90
index ba941b9c9e7c4..93e9b7a3eae6b 100644
--- a/flang/test/Semantics/OpenMP/symbol03.f90
+++ b/flang/test/Semantics/OpenMP/symbol03.f90
@@ -9,10 +9,10 @@
   !$omp parallel  private(a) shared(b)
   !DEF: /MainProgram1/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   a = 3.
-  !DEF: /MainProgram1/OtherConstruct1/b HostAssoc REAL(4)
+  !REF: /MainProgram1/b
   b = 4
   !$omp parallel  private(b) shared(a)
-  !DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/a HostAssoc REAL(4)
+  !REF: /MainProgram1/OtherConstruct1/a
   a = 5.
   !DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
   b = 6
diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90
index 1ad0c10a40135..fa0a8f65a4294 100644
--- a/flang/test/Semantics/OpenMP/symbol05.f90
+++ b/flang/test/Semantics/OpenMP/symbol05.f90
@@ -15,10 +15,10 @@ subroutine foo
     !DEF: /mm/foo/a ObjectEntity INTEGER(4)
     integer :: a = 3
     !$omp parallel
-    !DEF: /mm/foo/OtherConstruct1/a HostAssoc INTEGER(4)
+    !REF: /mm/foo/a
     a = 1
     !DEF: /mm/i PUBLIC (Implicit, OmpThreadprivate) ObjectEntity INTEGER(4)
-    !REF: /mm/foo/OtherConstruct1/a
+    !REF: /mm/foo/a
     i = a
     !$omp end parallel
     !REF: /mm/foo/a
diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90
index 8b4716999820b..e2250f5c7908a 100644
--- a/flang/test/Semantics/OpenMP/symbol07.f90
+++ b/flang/test/Semantics/OpenMP/symbol07.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
   !$omp parallel  default(none) private(a) shared(b)
   !DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   !REF: /function_call_in_region/foo
-  !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
+  !REF: /function_call_in_region/b
   a = foo(b)
   !$omp end parallel
   !REF: /function_call_in_region/a
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 69ccd17391b54..3af85af74ee97 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -28,18 +28,18 @@ subroutine test_do
  !DEF: /test_do/k ObjectEntity INTEGER(4)
  integer i, j, k
 !$omp parallel
- !DEF: /test_do/OtherConstruct1/i HostAssoc INTEGER(4)
+ !REF: /test_do/i
  i = 99
 !$omp do  collapse(2)
  !DEF: /test_do/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
  do i=1,5
   !DEF: /test_do/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   do j=6,10
-   !DEF: /test_do/OtherConstruct1/a HostAssoc REAL(4)
+   !REF: /test_do/a
    a(1,1,1) = 0.
    !DEF: /test_do/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_do/OtherConstruct1/a
+    !REF: /test_do/a
     !REF: /test_do/OtherConstruct1/k
     !REF: /test_do/OtherConstruct1/OtherConstruct1/j
     !REF: /test_do/OtherConstruct1/OtherConstruct1/i
@@ -65,11 +65,11 @@ subroutine test_pardo
  do i=1,5
    !DEF: /test_pardo/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
     do j=6,10
-   !DEF: /test_pardo/OtherConstruct1/a HostAssoc REAL(4)
+   !REF: /test_pardo/a
    a(1,1,1) = 0.
    !DEF: /test_pardo/OtherConstruct1/k (OmpPrivate) HostAssoc INTEGER(4)
    do k=11,15
-    !REF: /test_pardo/OtherConstruct1/a
+    !REF: /test_pardo/a
     !REF: /test_pardo/OtherConstruct1/k
     !REF: /test_pardo/OtherConstruct1/j
     !REF: /test_pardo/OtherConstruct1/i
@@ -138,15 +138,15 @@ subroutine dotprod (b, c, n, block_size, num_teams, block_threads)
  do i0=1,n,block_size
 !$omp parallel do  reduction(+:sum)
   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
-  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0 HostAssoc INTEGER(4)
+  !REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0
   !DEF: /dotprod/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
-  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/block_size HostAssoc INTEGER(4)
-  !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/n HostAssoc INTEGER(4)
+  !REF: /dotprod/block_size
+  !REF: /dotprod/n
   do i=i0,min(i0+block_size, n)
    !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/sum (OmpReduction) HostAssoc REAL(4)
-   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/b HostAssoc REAL(4)
+   !REF: /dotprod/b
    !REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i
-   !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/c HostAssoc REAL(4)
+   !REF: /dotprod/c
    sum = sum+b(i)*c(i)
   end do
  end do
@@ -174,7 +174,7 @@ subroutine test_simd
   do j=6,10
    !DEF: /test_simd/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !DEF: /test_simd/OtherConstruct1/a HostAssoc REAL(4)
+    !REF: /test_simd/a
     !REF: /test_simd/OtherConstruct1/k
     !REF: /test_simd/OtherConstruct1/j
     !REF: /test_simd/OtherConstruct1/i
@@ -201,7 +201,7 @@ subroutine test_simd_multi
   do j=6,10
    !DEF: /test_simd_multi/OtherConstruct1/k (OmpLastPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
    do k=11,15
-    !DEF: /test_simd_multi/OtherConstruct1/a HostAssoc REAL(4)
+    !REF: /test_simd_multi/a
     !REF: /test_simd_multi/OtherConstruct1/k
     !REF: /test_simd_multi/OtherConstruct1/j
     !REF: /test_simd_multi/OtherConstruct1/i
@@ -223,11 +223,11 @@ subroutine test_seq_loop
   !REF: /test_seq_loop/j
   j = -1
   !$omp parallel
-  !DEF: /test_seq_loop/OtherConstruct1/i HostAssoc INTEGER(4)
-  !DEF: /test_seq_loop/OtherConstruct1/j HostAssoc INTEGER(4)
+  !REF: /test_seq_loop/i
+  !REF: /test_seq_loop/j
   print *, i, j
   !$omp parallel
-  !DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
+  !REF: /test_seq_loop/i
   !DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
   print *, i, j
   !$omp do
@@ -237,12 +237,12 @@ subroutine test_seq_loop
    do j=1,10
    end do
   end do
-  !REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i
+  !REF: /test_seq_loop/i
   !REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j
   print *, i, j
   !$omp end parallel
-  !REF: /test_seq_loop/OtherConstruct1/i
-  !REF: /test_seq_loop/OtherConstruct1/j
+  !REF: /test_seq_loop/i
+  !REF: /test_seq_loop/j
   print *, i, j
   !$omp end parallel
   !REF: /test_seq_loop/i
diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90
index 8b4716999820b..e2250f5c7908a 100644
--- a/flang/test/Semantics/OpenMP/symbol09.f90
+++ b/flang/test/Semantics/OpenMP/symbol09.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
   !$omp parallel  default(none) private(a) shared(b)
   !DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
   !REF: /function_call_in_region/foo
-  !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
+  !REF: /function_call_in_region/b
   a = foo(b)
   !$omp end parallel
   !REF: /function_call_in_region/a

>From b79568654e38a14ef921af932ed96abd8961b1ed Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Wed, 24 Jul 2024 18:39:16 -0400
Subject: [PATCH 077/259] Clang: adjust MMX intrinsics release note.

---
 clang/docs/ReleaseNotes.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e2a680d73ab05..0b79e952b48af 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -178,7 +178,9 @@ X86 Support
   unconditionally. These intrinsics are therefore *no longer
   supported* if MMX is enabled without SSE2 -- either from targeting
   CPUs from the Pentium-MMX through the Pentium 3, or explicitly via
-  passing arguments such as ``-mmmx -mno-sse2``.
+  passing arguments such as ``-mmmx -mno-sse2``. MMX assembly code
+  remains supported without requiring SSE2, including inside
+  inline-assembly.
 
 - The compiler builtins such as ``__builtin_ia32_paddb`` which
   formerly implemented the above MMX intrinsic functions have been
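
To make the distinction drawn in the release note above concrete, here is a
hedged sketch (not part of the patch; the function name and operand choices
are illustrative only): with `-mmmx -mno-sse2` the ``*mmintrin.h`` intrinsics
are no longer available, but the same MMX operation can still be written as
inline assembly.

```cpp
// Illustration only: expected to compile with `clang -mmmx -mno-sse2 -c`,
// whereas the corresponding _mm_add_pi8 intrinsic would now be rejected.
#include <cstdint>

std::uint64_t paddb_via_inline_asm(std::uint64_t a, std::uint64_t b) {
  std::uint64_t r;
  asm("movq   %1, %%mm0\n\t" // load the first packed-byte operand into MM0
      "paddb  %2, %%mm0\n\t" // packed 8-bit add with the second operand
      "movq   %%mm0, %0\n\t" // store the packed result
      "emms"                 // clear MMX state before returning to x87/FP code
      : "=m"(r)
      : "m"(a), "m"(b)
      : "mm0");
  return r;
}
```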

>From 3f6eb13abf643afec17a73448ede380606531226 Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev at accesssoftek.com>
Date: Thu, 25 Jul 2024 02:13:30 +0300
Subject: [PATCH 078/259] [PAC][clang] Enable `-fptrauth-indirect-gotos` as
 part of pauthtest ABI (#100480)

---
 clang/lib/Driver/ToolChains/Clang.cpp | 4 ++++
 clang/test/Driver/aarch64-ptrauth.c   | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 752a71739eeb5..d3edda3f10543 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1512,6 +1512,10 @@ static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) {
           options::OPT_fno_ptrauth_vtable_pointer_type_discrimination))
     CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination");
 
+  if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos,
+                         options::OPT_fno_ptrauth_indirect_gotos))
+    CC1Args.push_back("-fptrauth-indirect-gotos");
+
   if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini,
                          options::OPT_fno_ptrauth_init_fini))
     CC1Args.push_back("-fptrauth-init-fini");
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index 20899b8920cf9..c8e3aeef1640a 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -21,16 +21,16 @@
 // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
 // PAUTHABI1:      "-cc1"{{.*}} "-triple" "aarch64-unknown-linux-pauthtest"
 // PAUTHABI1-SAME: "-target-abi" "pauthtest"
-// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
+// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini"
 
 // RUN: %clang -### -c --target=aarch64 -mabi=pauthtest -fno-ptrauth-intrinsics \
 // RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
 // RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
+// RUN:   -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
 // RUN: %clang -### -c --target=aarch64-pauthtest -fno-ptrauth-intrinsics \
 // RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
 // RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
+// RUN:   -fno-ptrauth-indirect-gotos -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
 // PAUTHABI2:     "-cc1"
 // PAUTHABI2-NOT: "-fptrauth-
 

>From 7b51777ed89969ae86a0714565d195faf394b7db Mon Sep 17 00:00:00 2001
From: Job Henandez Lara <hj93 at protonmail.com>
Date: Wed, 24 Jul 2024 16:53:23 -0700
Subject: [PATCH 079/259] [libc][math][c23] add entrypoints and tests for
 totalordermag{f,l,f128} (#100159)

Fixes https://github.com/llvm/llvm-project/issues/100139
---
 libc/config/darwin/arm/entrypoints.txt        |  3 ++
 libc/config/darwin/x86_64/entrypoints.txt     |  3 ++
 libc/config/linux/aarch64/entrypoints.txt     |  4 ++
 libc/config/linux/arm/entrypoints.txt         |  3 ++
 libc/config/linux/riscv/entrypoints.txt       |  4 ++
 libc/config/linux/x86_64/entrypoints.txt      |  4 ++
 libc/docs/math/index.rst                      |  2 +-
 libc/spec/stdc.td                             |  9 ++--
 libc/src/math/CMakeLists.txt                  |  4 ++
 libc/src/math/generic/CMakeLists.txt          | 49 +++++++++++++++++++
 libc/src/math/generic/totalordermag.cpp       | 20 ++++++++
 libc/src/math/generic/totalordermagf.cpp      | 20 ++++++++
 libc/src/math/generic/totalordermagf128.cpp   | 21 ++++++++
 libc/src/math/generic/totalordermagl.cpp      | 21 ++++++++
 libc/src/math/totalordermag.h                 | 20 ++++++++
 libc/src/math/totalordermagf.h                | 20 ++++++++
 libc/src/math/totalordermagf128.h             | 21 ++++++++
 libc/src/math/totalordermagl.h                | 20 ++++++++
 libc/test/src/math/smoke/CMakeLists.txt       | 48 ++++++++++++++++++
 .../src/math/smoke/totalordermag_test.cpp     | 13 +++++
 .../src/math/smoke/totalordermagf128_test.cpp | 13 +++++
 .../src/math/smoke/totalordermagf_test.cpp    | 13 +++++
 .../src/math/smoke/totalordermagl_test.cpp    | 13 +++++
 23 files changed, 344 insertions(+), 4 deletions(-)
 create mode 100644 libc/src/math/generic/totalordermag.cpp
 create mode 100644 libc/src/math/generic/totalordermagf.cpp
 create mode 100644 libc/src/math/generic/totalordermagf128.cpp
 create mode 100644 libc/src/math/generic/totalordermagl.cpp
 create mode 100644 libc/src/math/totalordermag.h
 create mode 100644 libc/src/math/totalordermagf.h
 create mode 100644 libc/src/math/totalordermagf128.h
 create mode 100644 libc/src/math/totalordermagl.h
 create mode 100644 libc/test/src/math/smoke/totalordermag_test.cpp
 create mode 100644 libc/test/src/math/smoke/totalordermagf128_test.cpp
 create mode 100644 libc/test/src/math/smoke/totalordermagf_test.cpp
 create mode 100644 libc/test/src/math/smoke/totalordermagl_test.cpp

diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index ea5c7b537bbec..3b50666b968e8 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -241,6 +241,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index 1a7353172d464..df9f9bfd54e81 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -207,6 +207,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     #libc.src.math.sqrtl
     #libc.src.math.tanf
     #libc.src.math.tanhf
+    #libc.src.math.totalordermag
+    #libc.src.math.totalordermagf
+    #libc.src.math.totalordermagl
     #libc.src.math.trunc
     #libc.src.math.truncf
     #libc.src.math.truncl
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 0be6f884f0368..2334fed773702 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -514,6 +514,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -637,6 +640,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.roundevenf128
     libc.src.math.scalbnf128
     libc.src.math.sqrtf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 55f118395c22e..61ee68ac66082 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -373,6 +373,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 597aeb361afe4..07466805b34cd 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -542,6 +542,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -600,6 +603,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.roundf128
     libc.src.math.scalbnf128
     libc.src.math.sqrtf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index f7813fc16ff7c..035ceb8ca57bf 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -542,6 +542,9 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.tan
     libc.src.math.tanf
     libc.src.math.tanhf
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
@@ -686,6 +689,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.roundf128
     libc.src.math.scalbnf128
     libc.src.math.sqrtf128
+    libc.src.math.totalordermagf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
     libc.src.math.ufromfpxf128
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index f287c16fd01e2..64de548a1ea1c 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -230,7 +230,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | totalorder       |                  |                 |                        | |check|              |                        | F.10.12.1              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| totalordermag    |                  |                 |                        | |check|              |                        | F.10.12.2              | N/A                        |
+| totalordermag    | |check|          | |check|         | |check|                | |check|              | |check|                | F.10.12.2              | N/A                        |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | trunc            | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.9.9               | F.10.6.9                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 0aae65308d33a..d5a5cb6fedb4b 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -720,10 +720,13 @@ def StdC : StandardSpec<"stdc"> {
 	  
           FunctionSpec<"dsqrtl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>]>,
 	
+	  GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
 
-          GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
-
-          GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+          FunctionSpec<"totalordermag", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoublePtr>]>,
+	  FunctionSpec<"totalordermagf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatPtr>]>,
+	  FunctionSpec<"totalordermagl", RetValSpec<IntType>, [ArgSpec<LongDoublePtr>, ArgSpec<LongDoublePtr>]>,
+	  GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+	  GuardedFunctionSpec<"totalordermagf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
 
           GuardedFunctionSpec<"getpayloadf16", RetValSpec<Float16Type>, [ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
 
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 25aef3f72e3cd..d70af33522d2b 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -435,7 +435,11 @@ add_math_entrypoint_object(tgammaf)
 
 add_math_entrypoint_object(totalorderf16)
 
+add_math_entrypoint_object(totalordermag)
+add_math_entrypoint_object(totalordermagf)
+add_math_entrypoint_object(totalordermagl)
 add_math_entrypoint_object(totalordermagf16)
+add_math_entrypoint_object(totalordermagf128)
 
 add_math_entrypoint_object(trunc)
 add_math_entrypoint_object(truncf)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index ef96d6c4c2f37..80e862542e29c 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3965,6 +3965,42 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  totalordermag
+  SRCS
+    totalordermag.cpp
+  HDRS
+    ../totalordermag.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalordermagf
+  SRCS
+    totalordermagf.cpp
+  HDRS
+    ../totalordermagf.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  totalordermagl
+  SRCS
+    totalordermagl.cpp
+  HDRS
+    ../totalordermagl.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   totalordermagf16
   SRCS
@@ -3977,6 +4013,19 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  totalordermagf128
+  SRCS
+    totalordermagf128.cpp
+  HDRS
+    ../totalordermagf128.h
+  DEPENDS
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.macros.properties.types
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   getpayloadf16
   SRCS
diff --git a/libc/src/math/generic/totalordermag.cpp b/libc/src/math/generic/totalordermag.cpp
new file mode 100644
index 0000000000000..fb8215cad68fe
--- /dev/null
+++ b/libc/src/math/generic/totalordermag.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalordermag function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermag.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermag, (const double *x, const double *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagf.cpp b/libc/src/math/generic/totalordermagf.cpp
new file mode 100644
index 0000000000000..6b93bac51d0fb
--- /dev/null
+++ b/libc/src/math/generic/totalordermagf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalordermagf function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagf, (const float *x, const float *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagf128.cpp b/libc/src/math/generic/totalordermagf128.cpp
new file mode 100644
index 0000000000000..b32abfa8d09b1
--- /dev/null
+++ b/libc/src/math/generic/totalordermagf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalordermagf128 function ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagf128,
+                   (const float128 *x, const float128 *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalordermagl.cpp b/libc/src/math/generic/totalordermagl.cpp
new file mode 100644
index 0000000000000..49eed62bf99eb
--- /dev/null
+++ b/libc/src/math/generic/totalordermagl.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalordermagl function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalordermagl.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalordermagl,
+                   (const long double *x, const long double *y)) {
+  return static_cast<int>(fputil::totalordermag(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/totalordermag.h b/libc/src/math/totalordermag.h
new file mode 100644
index 0000000000000..cfdd868add674
--- /dev/null
+++ b/libc/src/math/totalordermag.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermag -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermag(const double *x, const double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAG_H
diff --git a/libc/src/math/totalordermagf.h b/libc/src/math/totalordermagf.h
new file mode 100644
index 0000000000000..47ddd831808f5
--- /dev/null
+++ b/libc/src/math/totalordermagf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermagf ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagf(const float *x, const float *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGF_H
diff --git a/libc/src/math/totalordermagf128.h b/libc/src/math/totalordermagf128.h
new file mode 100644
index 0000000000000..389d0596b9d4b
--- /dev/null
+++ b/libc/src/math/totalordermagf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for totalordermagf128 -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagf128(const float128 *x, const float128 *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGF128_H
diff --git a/libc/src/math/totalordermagl.h b/libc/src/math/totalordermagl.h
new file mode 100644
index 0000000000000..64d27f1566877
--- /dev/null
+++ b/libc/src/math/totalordermagl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalordermagl ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERMAGL_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERMAGL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalordermagl(const long double *x, const long double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERMAGL_H
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 1b3c51739c0fe..5ddc88a38eb64 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3643,6 +3643,42 @@ add_fp_unittest(
     libc.src.math.totalorderf16
 )
 
+add_fp_unittest(
+  totalordermag_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermag_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermag
+)
+
+add_fp_unittest(
+  totalordermagf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagf_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagf
+)
+
+add_fp_unittest(
+  totalordermagl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagl_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagl
+)
+
 add_fp_unittest(
   totalordermagf16_test
   SUITE
@@ -3655,6 +3691,18 @@ add_fp_unittest(
     libc.src.math.totalordermagf16
 )
 
+add_fp_unittest(
+  totalordermagf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    totalordermagf128_test.cpp
+  HDRS
+    TotalOrderMagTest.h
+  DEPENDS
+    libc.src.math.totalordermagf128
+)
+
 add_fp_unittest(
   getpayloadf16_test
   SUITE
diff --git a/libc/test/src/math/smoke/totalordermag_test.cpp b/libc/test/src/math/smoke/totalordermag_test.cpp
new file mode 100644
index 0000000000000..6c3b6ddbd7422
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermag_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermag ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermag.h"
+
+LIST_TOTALORDERMAG_TESTS(double, LIBC_NAMESPACE::totalordermag)
diff --git a/libc/test/src/math/smoke/totalordermagf128_test.cpp b/libc/test/src/math/smoke/totalordermagf128_test.cpp
new file mode 100644
index 0000000000000..6cbe24ff8778f
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagf128 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagf128.h"
+
+LIST_TOTALORDERMAG_TESTS(float128, LIBC_NAMESPACE::totalordermagf128)
diff --git a/libc/test/src/math/smoke/totalordermagf_test.cpp b/libc/test/src/math/smoke/totalordermagf_test.cpp
new file mode 100644
index 0000000000000..c54cea21a52c7
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagf --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagf.h"
+
+LIST_TOTALORDERMAG_TESTS(float, LIBC_NAMESPACE::totalordermagf)
diff --git a/libc/test/src/math/smoke/totalordermagl_test.cpp b/libc/test/src/math/smoke/totalordermagl_test.cpp
new file mode 100644
index 0000000000000..bdb9f616aded7
--- /dev/null
+++ b/libc/test/src/math/smoke/totalordermagl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalordermagl --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderMagTest.h"
+
+#include "src/math/totalordermagl.h"
+
+LIST_TOTALORDERMAG_TESTS(long double, LIBC_NAMESPACE::totalordermagl)

>From 393a957d1c9303b87361dfdc3b8e504ba435ea8e Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe at google.com>
Date: Wed, 24 Jul 2024 16:55:45 -0700
Subject: [PATCH 080/259] [bazel] Add missing dependency after
 74a1ca504bf60f02431140ee72dbe1c158556237

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index eab5e28511c2c..5294351190d19 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -675,6 +675,7 @@ libc_support_library(
     deps = [
         ":__support_common",
         ":__support_cpp_type_traits",
+        ":__support_fputil_dyadic_float",
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_macros_optimization",
@@ -1089,7 +1090,7 @@ libc_support_library(
         ":__support_macros_optimization",
         ":__support_osutil_syscall",
         ":types_pid_t",
-    ]
+    ],
 )
 
 libc_support_library(

>From 0af754213507972a0d0301bc195d65414d8dc193 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin at gmail.com>
Date: Tue, 23 Jul 2024 16:06:00 -0700
Subject: [PATCH 081/259] Reapply "[WebAssembly] Fix phi handling for Wasm SjLj
 (#99730)"

This reapplies #99730. #99730 contained a nondeterministic iteration
which failed the reverse-iteration bot
(https://lab.llvm.org/buildbot/#/builders/110/builds/474) and was reverted
in
https://github.com/llvm/llvm-project/commit/f3f0d9928f982cfd302351f418bcc5b63cc1bb9d.

The fix is to make the order of iteration over the new predecessors
deterministic by using `SmallSetVector`; a standalone sketch of that
container's behavior follows the diff below.
```diff
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1689,7 +1689,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
     }
   }

-  SmallDenseMap<BasicBlock *, SmallPtrSet<BasicBlock *, 4>, 4>
+  SmallDenseMap<BasicBlock *, SmallSetVector<BasicBlock *, 4>, 4>
       UnwindDestToNewPreds;
   for (auto *CI : LongjmpableCalls) {
     // Even if the callee function has attribute 'nounwind', which is true for
```
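
For reference, a minimal standalone sketch (not part of the patch, and assuming
only LLVM's ADT headers) of the property being relied on: `SmallSetVector`
deduplicates like a set but iterates in insertion order, so walking the new
predecessors no longer depends on pointer values the way `SmallPtrSet`
iteration can.

```cpp
// Illustration only: SmallSetVector keeps insertion order while rejecting
// duplicates, which is what makes the phi-updating loops deterministic.
#include "llvm/ADT/SetVector.h"
#include <cstdio>

int main() {
  int Blocks[4]; // stand-ins for the new predecessor BasicBlocks
  llvm::SmallSetVector<int *, 4> NewPreds;
  NewPreds.insert(&Blocks[2]);
  NewPreds.insert(&Blocks[0]);
  NewPreds.insert(&Blocks[2]); // duplicate insert is ignored
  // Visits index 2 then index 0 on every run and every host; a SmallPtrSet
  // would visit them in a pointer-value-dependent (unstable) order.
  for (int *BB : NewPreds)
    std::printf("%td\n", BB - Blocks);
  return 0;
}
```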
---
 .../WebAssemblyLowerEmscriptenEHSjLj.cpp      |  50 +++++++
 .../WebAssembly/lower-wasm-ehsjlj-phi.ll      | 126 ++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 7cc030460e30f..17bec8e2a6a45 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -777,6 +777,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
   SSAUpdaterBulk SSA;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
+      if (I.getType()->isVoidTy())
+        continue;
       unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
       // If a value is defined by an invoke instruction, it is only available in
       // its normal destination and not in its unwind destination.
@@ -1687,6 +1689,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
     }
   }
 
+  SmallDenseMap<BasicBlock *, SmallSetVector<BasicBlock *, 4>, 4>
+      UnwindDestToNewPreds;
   for (auto *CI : LongjmpableCalls) {
     // Even if the callee function has attribute 'nounwind', which is true for
     // all C functions, it can longjmp, which means it can throw a Wasm
@@ -1724,6 +1728,11 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
     }
     if (!UnwindDest)
       UnwindDest = CatchDispatchLongjmpBB;
+    // Because we are changing a longjmpable call to an invoke, its unwind
+    // destination can be an existing EH pad that already has phis, and the BB
+    // with the newly created invoke will become a new predecessor of that EH
+    // pad. In this case we need to add the new predecessor to those phis.
+    UnwindDestToNewPreds[UnwindDest].insert(CI->getParent());
     changeToInvokeAndSplitBasicBlock(CI, UnwindDest);
   }
 
@@ -1752,4 +1761,45 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
 
   for (Instruction *I : ToErase)
     I->eraseFromParent();
+
+  // Add entries for new predecessors to phis in unwind destinations. We use
+  // 'undef' as a placeholder value. We should make sure the phis have a valid
+  // set of predecessors before running SSAUpdater, because SSAUpdater
+  // internally can use existing phis to gather predecessor info rather than
+  // scanning the actual CFG (See FindPredecessorBlocks in SSAUpdater.cpp for
+  // details).
+  for (auto &[UnwindDest, NewPreds] : UnwindDestToNewPreds) {
+    for (PHINode &PN : UnwindDest->phis()) {
+      for (auto *NewPred : NewPreds) {
+        assert(PN.getBasicBlockIndex(NewPred) == -1);
+        PN.addIncoming(UndefValue::get(PN.getType()), NewPred);
+      }
+    }
+  }
+
+  // For unwind destinations for newly added invokes to longjmpable functions,
+  // calculate incoming values for the newly added predecessors using
+  // SSAUpdater. We add existing values in the phis to SSAUpdater as available
+  // values and let it calculate what the value should be at the end of new
+  // incoming blocks.
+  for (auto &[UnwindDest, NewPreds] : UnwindDestToNewPreds) {
+    for (PHINode &PN : UnwindDest->phis()) {
+      SSAUpdater SSA;
+      SSA.Initialize(PN.getType(), PN.getName());
+      for (unsigned Idx = 0, E = PN.getNumIncomingValues(); Idx != E; ++Idx) {
+        if (NewPreds.contains(PN.getIncomingBlock(Idx)))
+          continue;
+        Value *V = PN.getIncomingValue(Idx);
+        if (auto *II = dyn_cast<InvokeInst>(V))
+          SSA.AddAvailableValue(II->getNormalDest(), II);
+        else if (auto *I = dyn_cast<Instruction>(V))
+          SSA.AddAvailableValue(I->getParent(), I);
+        else
+          SSA.AddAvailableValue(PN.getIncomingBlock(Idx), V);
+      }
+      for (auto *NewPred : NewPreds)
+        PN.setIncomingValueForBlock(NewPred, SSA.GetValueAtEndOfBlock(NewPred));
+      assert(PN.isComplete());
+    }
+  }
 }
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll
new file mode 100644
index 0000000000000..97f6d797a63bd
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj-phi.ll
@@ -0,0 +1,126 @@
+; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-eh -wasm-enable-sjlj -S | FileCheck %s
+
+target triple = "wasm32-unknown-emscripten"
+
+%struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] }
+ at buf = internal global [1 x %struct.__jmp_buf_tag] zeroinitializer, align 16
+
+; When longjmpable calls are converted into invokes in the Wasm SjLj
+; transformation and their unwind destination is an existing catchpad or
+; cleanuppad (in order to maintain the scope structure), the new pred BBs
+; created by the invokes, together with the correct incoming values, should
+; be added to the existing phis in those unwind destinations.
+
+; When longjmpable calls are within a cleanuppad.
+define void @longjmpable_invoke_phi0() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-LABEL: @longjmpable_invoke_phi0
+entry:
+  %val.entry = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  %0 = call i32 @setjmp(ptr @buf) #2
+  invoke void @foo()
+          to label %bb1 unwind label %ehcleanup1
+
+bb1:                                              ; preds = %entry
+  ; We use the llvm.wasm.memory.size intrinsic just to get/use an i32 value.
+  ; The reason we use an intrinsic here is to make it not longjmpable. If this
+  ; could longjmp, the result would be more complicated and hard to check.
+  %val.bb1 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  invoke void @foo()
+          to label %bb2 unwind label %ehcleanup0
+
+bb2:                                              ; preds = %bb1
+  unreachable
+
+ehcleanup0:                                       ; preds = %bb1
+  %1 = cleanuppad within none []
+  call void @longjmpable() [ "funclet"(token %1) ]
+; CHECK:      ehcleanup0
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc unwind label %ehcleanup1
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %bb3 unwind label %ehcleanup1
+
+bb3:                                              ; preds = %ehcleanup0
+  %val.bb3 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  call void @longjmpable() [ "funclet"(token %1) ]
+; CHECK:      bb3:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc1 unwind label %ehcleanup1
+  cleanupret from %1 unwind label %ehcleanup1
+
+ehcleanup1:                                       ; preds = %bb3, %ehcleanup0, %entry
+  %phi = phi i32 [ %val.entry, %entry ], [ %val.bb1, %ehcleanup0 ], [ %val.bb3, %bb3 ]
+; CHECK:      ehcleanup1:
+; CHECK-NEXT:   %phi = phi i32 [ %val.entry2, %entry.split.split ], [ %val.bb1, %.noexc ], [ %val.bb3, %.noexc1 ], [ %val.bb1, %ehcleanup0 ], [ %val.bb3, %bb3 ]
+  %2 = cleanuppad within none []
+  %3 = call i32 @llvm.wasm.memory.size.i32(i32 %phi)
+  cleanupret from %2 unwind to caller
+}
+
+; When longjmpable calls are within a catchpad.
+define void @longjmpable_invoke_phi1() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-LABEL: @longjmpable_invoke_phi1
+entry:
+  %val.entry = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  %0 = call i32 @setjmp(ptr @buf) #2
+  invoke void @foo()
+          to label %bb1 unwind label %ehcleanup
+
+bb1:                                              ; preds = %entry
+  %val.bb1 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  invoke void @foo()
+          to label %bb2 unwind label %catch.dispatch
+
+bb2:                                              ; preds = %bb1
+  unreachable
+
+catch.dispatch:                                   ; preds = %bb1
+  %1 = catchswitch within none [label %catch.start] unwind label %ehcleanup
+
+catch.start:                                      ; preds = %catch.dispatch
+  %2 = catchpad within %1 [ptr null]
+  %3 = call ptr @llvm.wasm.get.exception(token %2)
+  %4 = call i32 @llvm.wasm.get.ehselector(token %2)
+  call void @longjmpable() [ "funclet"(token %2) ]
+; CHECK:      catch.start:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc unwind label %ehcleanup
+  invoke void @foo() [ "funclet"(token %2) ]
+          to label %bb3 unwind label %ehcleanup
+
+bb3:                                              ; preds = %catch.start
+  %val.bb3 = call i32 @llvm.wasm.memory.size.i32(i32 0)
+  call void @longjmpable() [ "funclet"(token %2) ]
+; CHECK:      bb3:
+; CHECK:        invoke void @longjmpable
+; CHECK-NEXT:           to label %.noexc1 unwind label %ehcleanup
+  invoke void @foo() [ "funclet"(token %2) ]
+          to label %bb4 unwind label %ehcleanup
+
+bb4:                                              ; preds = %bb3
+  unreachable
+
+ehcleanup:                                        ; preds = %bb3, %catch.start, %catch.dispatch, %entry
+  %phi = phi i32 [ %val.entry, %entry ], [ %val.bb1, %catch.dispatch ], [ %val.bb1, %catch.start ], [ %val.bb3, %bb3 ]
+; CHECK:      ehcleanup:
+; CHECK-NEXT:   %phi = phi i32 [ %val.entry2, %entry.split.split ], [ %val.bb1, %catch.dispatch ], [ %val.bb1, %.noexc ], [ %val.bb3, %.noexc1 ], [ %val.bb1, %catch.start ], [ %val.bb3, %bb3 ]
+  %5 = cleanuppad within none []
+  %6 = call i32 @llvm.wasm.memory.size.i32(i32 %phi)
+  cleanupret from %5 unwind to caller
+}
+
+declare i32 @setjmp(ptr)
+declare i32 @__gxx_wasm_personality_v0(...)
+declare void @foo()
+declare void @longjmpable()
+declare void @use_i32(i32)
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i32 @llvm.wasm.get.ehselector(token) #0
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.wasm.get.exception(token) #0
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare i32 @llvm.wasm.memory.size.i32(i32) #1
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(read) }
+attributes #2 = { returns_twice }

>From 115c89b94cb8af10c2029001a1dd28154d948a37 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Wed, 24 Jul 2024 17:04:39 -0700
Subject: [PATCH 082/259] [msan] Enable and update neon_vst_float test case
 (#100435)

This enables the neon_vst_float test case
(https://github.com/llvm/llvm-project/pull/100210). Although MSan does
not yet generate useful IR, the test case should still run successfully
because opt is run with -disable-verify.

This patch also makes minor fixes to the test case:
- 'ptr %a' is renamed to 'ptr %p' because update_test_checks.py is
case-insensitive, and was unexpectedly aliasing the %A and %a in the
expected output ('<1 x double> [[A]], <1 x double> [[B]], ptr [[A]]').
- The sample output for st1x{2,3,4} was previously accidentally
generated using a prototype version of MSan rather than trunk; these
instructions are not yet instrumented.
- The comment on how the test case was generated is updated, because '; |
sed -r 's/^\/\/ CHECK:[ ]*//'' was being interpreted by FileCheck as
a CHECK: command.
---
 .../MemorySanitizer/AArch64/neon_vst_float.ll | 764 ++++++++++--------
 1 file changed, 411 insertions(+), 353 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
index e9881930b700a..2ac676f3559b4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
@@ -5,12 +5,10 @@
 ;
 ; RUN: opt < %s -passes=msan -S -disable-verify | FileCheck %s
 ;
-; UNSUPPORTED: {{.*}}
-;
 ; Generated with:
 ;     grep call clang/test/CodeGen/aarch64-neon-intrinsics.c \
 ;         |  grep 'neon[.]st'                                \
-;         | sed -r 's/^\/\/ CHECK:[ ]*//'                    \
+;         | sed -r 's/^\/\/ CHECK[:][ ]*//'                  \
 ;         | cut -d ' ' -f 1 --complement                     \
 ;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%A/'             \
 ;         | sed -r 's/[[][[]TMP[0-9]+[]][]]/%B/'             \
@@ -36,318 +34,374 @@ target triple = "aarch64--linux-android9001"
 
 ; -----------------------------------------------------------------------------------------------------------------------------------------------
 
-define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %a) sanitize_memory {
+define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       7:
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[A]])
-;
-; EDITOR'S NOTE: the next call is invalid because the parameters (shadows) are integer, but the called function
-;                expects floating-point parameters.
-;
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %a)
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %p)
   ret void
 }
 
-define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %a) sanitize_memory {
+define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
-; CHECK:       7:
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %a)
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %p)
   ret void
 }
 
-define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %a) sanitize_memory {
+define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
-; CHECK:       7:
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %a)
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %p)
   ret void
 }
 
-define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %a) sanitize_memory {
+define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
-; CHECK:       7:
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
+; CHECK:       7:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %a)
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
   ret void
 }
 
-define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a) sanitize_memory {
+define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a)
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p)
   ret void
 }
 
-define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a) sanitize_memory {
+define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a)
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p)
   ret void
 }
 
-define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a) sanitize_memory {
+define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a)
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p)
   ret void
 }
 
-define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a) sanitize_memory {
+define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a)
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p)
   ret void
 }
 
-define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a) sanitize_memory {
+define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a)
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p)
   ret void
 }
 
-define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a) sanitize_memory {
+define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a)
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p)
   ret void
 }
 
-define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a) sanitize_memory {
+define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a)
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p)
   ret void
 }
 
-define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a) sanitize_memory {
+define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
-; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[A]])
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
+; CHECK:       11:
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a)
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p)
   ret void
 }
 
-define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %a) sanitize_memory {
+define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v16i8(
-; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -356,22 +410,22 @@ define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %p)
   ret void
 }
 
-define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %a) sanitize_memory {
+define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -380,22 +434,26 @@ define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %a) sanitize_memory
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
+;
+; Note: the next call is invalid because the parameters (the shadows) are integer vectors, but the called
+;       intrinsic expects floating-point parameters.
+;
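+; For reference (the declaration is not part of this test; it is inferred from the calls above), the
+; intrinsic carries floating-point data operands:
+;
+;   declare void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double>, <1 x double>, ptr)
+;
+; while the shadow call checked below passes <1 x i64> operands.
+;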
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %p)
   ret void
 }
 
-define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %a) sanitize_memory {
+define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -404,22 +462,22 @@ define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %p)
   ret void
 }
 
-define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %a) sanitize_memory {
+define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2f32(
-; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -428,22 +486,22 @@ define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> %A, <2 x float> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> %A, <2 x float> %B, ptr %p)
   ret void
 }
 
-define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %a) sanitize_memory {
+define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -452,22 +510,22 @@ define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %a) sanitize_memory
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %p)
   ret void
 }
 
-define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %a) sanitize_memory {
+define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2i32(
-; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -476,22 +534,22 @@ define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %p)
   ret void
 }
 
-define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %a) sanitize_memory {
+define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -500,22 +558,22 @@ define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
   ret void
 }
 
-define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %a) sanitize_memory {
+define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4f16(
-; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -524,22 +582,22 @@ define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> %A, <4 x half> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> %A, <4 x half> %B, ptr %p)
   ret void
 }
 
-define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %a) sanitize_memory {
+define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4f32(
-; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -548,22 +606,22 @@ define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %A, <4 x float> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %A, <4 x float> %B, ptr %p)
   ret void
 }
 
-define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %a) sanitize_memory {
+define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4i16(
-; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -572,22 +630,22 @@ define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %p)
   ret void
 }
 
-define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %a) sanitize_memory {
+define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4i32(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -596,22 +654,22 @@ define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %p)
   ret void
 }
 
-define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %a) sanitize_memory {
+define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8f16(
-; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -620,22 +678,22 @@ define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> %A, <8 x half> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> %A, <8 x half> %B, ptr %p)
   ret void
 }
 
-define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %a) sanitize_memory {
+define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8i16(
-; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -644,22 +702,22 @@ define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %p)
   ret void
 }
 
-define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %a) sanitize_memory {
+define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8i8(
-; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -668,23 +726,23 @@ define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %a) sanitize_memory {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %a)
+  call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %p)
   ret void
 }
 
-define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %a) sanitize_memory {
+define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v16i8(
-; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -693,23 +751,23 @@ define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p)
   ret void
 }
 
-define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a) sanitize_memory {
+define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -718,23 +776,23 @@ define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p)
   ret void
 }
 
-define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a) sanitize_memory {
+define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -743,23 +801,23 @@ define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p)
   ret void
 }
 
-define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %a) sanitize_memory {
+define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v2f32(
-; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -768,23 +826,23 @@ define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %a) s
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p)
   ret void
 }
 
-define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a) sanitize_memory {
+define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -793,23 +851,23 @@ define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p)
   ret void
 }
 
-define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %a) sanitize_memory {
+define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v2i32(
-; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -818,23 +876,23 @@ define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p)
   ret void
 }
 
-define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a) sanitize_memory {
+define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -843,23 +901,23 @@ define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p)
   ret void
 }
 
-define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %a) sanitize_memory {
+define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v4f16(
-; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -868,23 +926,23 @@ define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %a) sani
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p)
   ret void
 }
 
-define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %a) sanitize_memory {
+define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v4f32(
-; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -893,23 +951,23 @@ define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %a) s
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p)
   ret void
 }
 
-define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %a) sanitize_memory {
+define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v4i16(
-; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -918,23 +976,23 @@ define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p)
   ret void
 }
 
-define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %a) sanitize_memory {
+define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v4i32(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -943,23 +1001,23 @@ define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p)
   ret void
 }
 
-define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %a) sanitize_memory {
+define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v8f16(
-; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -968,23 +1026,23 @@ define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %a) sani
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p)
   ret void
 }
 
-define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %a) sanitize_memory {
+define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v8i16(
-; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -993,23 +1051,23 @@ define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %a) sanitiz
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p)
   ret void
 }
 
-define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %a) sanitize_memory {
+define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v8i8(
-; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1018,24 +1076,24 @@ define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %a) sanitize_me
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       9:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %a)
+  call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p)
   ret void
 }
 
-define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %a) sanitize_memory {
+define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v16i8(
-; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1044,24 +1102,24 @@ define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p)
   ret void
 }
 
-define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a) sanitize_memory {
+define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v1f64(
-; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1070,24 +1128,24 @@ define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x d
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p)
   ret void
 }
 
-define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a) sanitize_memory {
+define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v1i64(
-; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1096,24 +1154,24 @@ define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p)
   ret void
 }
 
-define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %a) sanitize_memory {
+define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v2f32(
-; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1122,24 +1180,24 @@ define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x floa
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], <2 x float> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], <2 x float> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p)
   ret void
 }
 
-define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a) sanitize_memory {
+define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v2f64(
-; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1148,24 +1206,24 @@ define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x d
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p)
   ret void
 }
 
-define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %a) sanitize_memory {
+define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v2i32(
-; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1174,24 +1232,24 @@ define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p)
   ret void
 }
 
-define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a) sanitize_memory {
+define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1200,24 +1258,24 @@ define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p)
   ret void
 }
 
-define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %a) sanitize_memory {
+define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v4f16(
-; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], <4 x half> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], <4 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1226,24 +1284,24 @@ define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], <4 x half> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], <4 x half> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p)
   ret void
 }
 
-define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %a) sanitize_memory {
+define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v4f32(
-; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1252,24 +1310,24 @@ define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x floa
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], <4 x float> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], <4 x float> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p)
   ret void
 }
 
-define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %a) sanitize_memory {
+define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v4i16(
-; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1278,24 +1336,24 @@ define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p)
   ret void
 }
 
-define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %a) sanitize_memory {
+define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v4i32(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1304,24 +1362,24 @@ define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p)
   ret void
 }
 
-define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %a) sanitize_memory {
+define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v8f16(
-; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1330,24 +1388,24 @@ define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], <8 x half> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], <8 x half> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p)
   ret void
 }
 
-define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %a) sanitize_memory {
+define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v8i16(
-; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1356,24 +1414,24 @@ define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, p
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p)
   ret void
 }
 
-define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %a) sanitize_memory {
+define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st4_v8i8(
-; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1382,11 +1440,11 @@ define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %a
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[A]])
+; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
 ; CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %a)
+  call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p)
   ret void
 }
 ;.

>From e894df6392beea3723627329009f3e6d51d16f47 Mon Sep 17 00:00:00 2001
From: Dmitry Chestnykh <dm.chestnykh at gmail.com>
Date: Thu, 25 Jul 2024 03:06:32 +0300
Subject: [PATCH 083/259] [compiler-rt] Implement `DumpAllRegisters` for
 arm-linux and aarch64-linux (#100398)

Reland with a couple of build fixes

#100337 #100342 #99613
---
 .../lib/sanitizer_common/sanitizer_linux.cpp  | 154 +++++++++++++++++-
 .../Linux/dump_registers_aarch64.cpp          |  23 +++
 .../TestCases/Linux/dump_registers_arm.cpp    |  19 +++
 3 files changed, 191 insertions(+), 5 deletions(-)
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 483a1042a6238..2ea61b1cb424c 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -2172,15 +2172,125 @@ static const char *RegNumToRegName(int reg) {
       return "ebp";
     case REG_ESP:
       return "esp";
+#    elif defined(__arm__)
+#      ifdef MAKE_CASE
+#        undef MAKE_CASE
+#      endif
+#      define REG_STR(reg) #reg
+#      define MAKE_CASE(N) \
+        case REG_R##N:     \
+          return REG_STR(r##N)
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    MAKE_CASE(11);
+    MAKE_CASE(12);
+    case REG_R13:
+      return "sp";
+    case REG_R14:
+      return "lr";
+    case REG_R15:
+      return "pc";
+#    elif defined(__aarch64__)
+#      define REG_STR(reg) #reg
+#      define MAKE_CASE(N) \
+        case N:            \
+          return REG_STR(x##N)
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    MAKE_CASE(11);
+    MAKE_CASE(12);
+    MAKE_CASE(13);
+    MAKE_CASE(14);
+    MAKE_CASE(15);
+    MAKE_CASE(16);
+    MAKE_CASE(17);
+    MAKE_CASE(18);
+    MAKE_CASE(19);
+    MAKE_CASE(20);
+    MAKE_CASE(21);
+    MAKE_CASE(22);
+    MAKE_CASE(23);
+    MAKE_CASE(24);
+    MAKE_CASE(25);
+    MAKE_CASE(26);
+    MAKE_CASE(27);
+    MAKE_CASE(28);
+    case 29:
+      return "fp";
+    case 30:
+      return "lr";
+    case 31:
+      return "sp";
 #    endif
-#  endif
+#  endif  // SANITIZER_LINUX
     default:
       return NULL;
   }
   return NULL;
 }
 
-#  if SANITIZER_LINUX
+#  if SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
+static uptr GetArmRegister(ucontext_t *ctx, int RegNum) {
+  switch (RegNum) {
+#    if defined(__arm__)
+#      ifdef MAKE_CASE
+#        undef MAKE_CASE
+#      endif
+#      define MAKE_CASE(N) \
+        case REG_R##N:     \
+          return ctx->uc_mcontext.arm_r##N
+    MAKE_CASE(0);
+    MAKE_CASE(1);
+    MAKE_CASE(2);
+    MAKE_CASE(3);
+    MAKE_CASE(4);
+    MAKE_CASE(5);
+    MAKE_CASE(6);
+    MAKE_CASE(7);
+    MAKE_CASE(8);
+    MAKE_CASE(9);
+    MAKE_CASE(10);
+    case REG_R11:
+      return ctx->uc_mcontext.arm_fp;
+    case REG_R12:
+      return ctx->uc_mcontext.arm_ip;
+    case REG_R13:
+      return ctx->uc_mcontext.arm_sp;
+    case REG_R14:
+      return ctx->uc_mcontext.arm_lr;
+    case REG_R15:
+      return ctx->uc_mcontext.arm_pc;
+#    elif defined(__aarch64__)
+    case 0 ... 30:
+      return ctx->uc_mcontext.regs[RegNum];
+    case 31:
+      return ctx->uc_mcontext.sp;
+#    endif
+    default:
+      return 0;
+  }
+  return 0;
+}
+#  endif  // SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
+
 UNUSED
 static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
   const char *RegName = RegNumToRegName(RegNum);
@@ -2189,11 +2299,16 @@ static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
          RegName, ctx->uc_mcontext.gregs[RegNum]);
 #    elif defined(__i386__)
   Printf("%s = 0x%08x  ", RegName, ctx->uc_mcontext.gregs[RegNum]);
-#    else
+#  elif defined(__arm__)
+  Printf("%s%s = 0x%08zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
+         GetArmRegister(ctx, RegNum));
+#  elif defined(__aarch64__)
+  Printf("%s%s = 0x%016zx  ", internal_strlen(RegName) == 2 ? " " : "", RegName,
+         GetArmRegister(ctx, RegNum));
+#  else
   (void)RegName;
-#    endif
-}
 #  endif
+}
 
 void SignalContext::DumpAllRegisters(void *context) {
   ucontext_t *ucontext = (ucontext_t *)context;
@@ -2236,6 +2351,35 @@ void SignalContext::DumpAllRegisters(void *context) {
   DumpSingleReg(ucontext, REG_EBP);
   DumpSingleReg(ucontext, REG_ESP);
   Printf("\n");
+#    elif defined(__arm__)
+  Report("Register values:\n");
+  DumpSingleReg(ucontext, REG_R0);
+  DumpSingleReg(ucontext, REG_R1);
+  DumpSingleReg(ucontext, REG_R2);
+  DumpSingleReg(ucontext, REG_R3);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R4);
+  DumpSingleReg(ucontext, REG_R5);
+  DumpSingleReg(ucontext, REG_R6);
+  DumpSingleReg(ucontext, REG_R7);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R8);
+  DumpSingleReg(ucontext, REG_R9);
+  DumpSingleReg(ucontext, REG_R10);
+  DumpSingleReg(ucontext, REG_R11);
+  Printf("\n");
+  DumpSingleReg(ucontext, REG_R12);
+  DumpSingleReg(ucontext, REG_R13);
+  DumpSingleReg(ucontext, REG_R14);
+  DumpSingleReg(ucontext, REG_R15);
+  Printf("\n");
+#    elif defined(__aarch64__)
+  Report("Register values:\n");
+  for (int i = 0; i <= 31; ++i) {
+    DumpSingleReg(ucontext, i);
+    if (i % 4 == 3)
+      Printf("\n");
+  }
 #    else
   (void)ucontext;
 #    endif
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
new file mode 100644
index 0000000000000..e01b826c86b8a
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
@@ -0,0 +1,23 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
+// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
+//
+// REQUIRES: aarch64-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: x0 = {{0x[0-9a-f]+}}   x1 = {{0x[0-9a-f]+}}   x2 = {{0x[0-9a-f]+}}   x3 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: x4 = {{0x[0-9a-f]+}}   x5 = {{0x[0-9a-f]+}}   x6 = {{0x[0-9a-f]+}}   x7 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: x8 = {{0x[0-9a-f]+}}   x9 = {{0x[0-9a-f]+}}  x10 = {{0x[0-9a-f]+}}  x11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x12 = {{0x[0-9a-f]+}}  x13 = {{0x[0-9a-f]+}}  x14 = {{0x[0-9a-f]+}}  x15 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x16 = {{0x[0-9a-f]+}}  x17 = {{0x[0-9a-f]+}}  x18 = {{0x[0-9a-f]+}}  x19 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x20 = {{0x[0-9a-f]+}}  x21 = {{0x[0-9a-f]+}}  x22 = {{0x[0-9a-f]+}}  x23 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x24 = {{0x[0-9a-f]+}}  x25 = {{0x[0-9a-f]+}}  x26 = {{0x[0-9a-f]+}}  x27 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:x28 = {{0x[0-9a-f]+}}   fp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
new file mode 100644
index 0000000000000..e17dbf196227b
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx  %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
+// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
+//
+// REQUIRES: arm-target-arch
+
+#include <signal.h>
+
+int main() {
+  raise(SIGSEGV);
+  // CHECK-DUMP: Register values
+  // CHECK-DUMP-NEXT: r0 = {{0x[0-9a-f]+}}   r1 = {{0x[0-9a-f]+}}   r2 = {{0x[0-9a-f]+}}   r3 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r4 = {{0x[0-9a-f]+}}   r5 = {{0x[0-9a-f]+}}   r6 = {{0x[0-9a-f]+}}   r7 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT: r8 = {{0x[0-9a-f]+}}   r9 = {{0x[0-9a-f]+}}  r10 = {{0x[0-9a-f]+}}  r11 = {{0x[0-9a-f]+}}
+  // CHECK-DUMP-NEXT:r12 = {{0x[0-9a-f]+}}   sp = {{0x[0-9a-f]+}}   lr = {{0x[0-9a-f]+}}   pc = {{0x[0-9a-f]+}}
+  // CHECK-NODUMP-NOT: Register values
+  return 0;
+}

>From 2ba1aeed2efd8156717886f89f6d4270b1df7a18 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Wed, 24 Jul 2024 17:25:57 -0700
Subject: [PATCH 084/259] [lldb] Don't use a vm addr range starting at 0 for
 local memory (#100288)

When an inferior stub cannot allocate memory for lldb and lldb needs to
store the results of expressions, it stores them in lldb's own memory
range ("host memory"). But it needs to find a virtual address range that
is not used in the inferior process. It tries to use the
qMemoryRegionInfo gdb remote serial protocol packet to find a range that
is inaccessible, starting at address 0 and moving up the size of each
region.

If the first region found at address 0 is inaccessible, lldb will use
the address range starting at 0 to mean "read lldb's host memory, not
the process memory", and programs that crash with a null dereference
will have poor behavior.

This patch skips consideration of a memory region that starts at address
0.

I also clarified the documentation of qMemoryRegionInfo to make it clear
that the stub is required to provide permissions for a memory range that
is accessible; it is not an optional key in this response. This issue
was originally found with a stub that did not list permissions in its
response, so lldb treated the first region returned as the one it would
use. (The stub also didn't support the memory-allocate packet.)
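
As a rough sketch (not the actual lldb code; `RegionInfo`, `QueryRegion`
and `FindInaccessibleRange` are made-up names standing in for
MemoryRegionInfo and Process::GetMemoryRegionInfo), the region walk with
the new skip looks like this:

    #include <cstdint>

    struct RegionInfo {
      uint64_t base;
      uint64_t end;  // one past the last address in the region
      bool readable, writable, executable;
    };

    // Walk the regions reported by the stub and return the base of the
    // first inaccessible region that does not start at address 0.
    uint64_t FindInaccessibleRange(uint64_t end_of_memory,
                                   RegionInfo (*QueryRegion)(uint64_t)) {
      uint64_t candidate = 0;
      while (candidate < end_of_memory) {
        RegionInfo info = QueryRegion(candidate);
        bool accessible = info.readable || info.writable || info.executable;
        // Never hand out the range starting at 0: address 0 would then
        // mean "read lldb's host memory", hiding null-dereference crashes.
        if (!accessible && info.base != 0)
          return info.base;
        candidate = info.end;  // move on to the next region
      }
      return UINT64_MAX;  // no usable range below end_of_memory
    }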
---
 lldb/docs/resources/lldbgdbremote.md   |  8 +++++++-
 lldb/source/Expression/IRMemoryMap.cpp | 18 +++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md
index 7076a75032dae..5cac3736337a8 100644
--- a/lldb/docs/resources/lldbgdbremote.md
+++ b/lldb/docs/resources/lldbgdbremote.md
@@ -1403,6 +1403,12 @@ For instance, with a macOS process which has nothing mapped in the first
 The lack of `permissions:` indicates that none of read/write/execute are valid
 for this region.
 
+The stub must include `permissions:` key-value on all memory ranges
+that are valid to access in the inferior process -- the lack of
+`permissions:` means that this is an inaccessible (no page table
+entries exist, in a system using VM) memory range.  If a stub cannot
+determine actual permissions, return `rwx`.
+
 **Priority To Implement:** Medium
 
 This is nice to have, but it isn't necessary. It helps LLDB
@@ -2434,4 +2440,4 @@ The `0x` prefixes are optional - like most of the gdb-remote packets,
 omitting them will work fine; these numbers are always base 16.
 
 The length of the payload is not provided.  A reliable, 8-bit clean,
-transport layer is assumed.
\ No newline at end of file
+transport layer is assumed.
diff --git a/lldb/source/Expression/IRMemoryMap.cpp b/lldb/source/Expression/IRMemoryMap.cpp
index de631370bb048..0c1d9016616cb 100644
--- a/lldb/source/Expression/IRMemoryMap.cpp
+++ b/lldb/source/Expression/IRMemoryMap.cpp
@@ -84,7 +84,7 @@ lldb::addr_t IRMemoryMap::FindSpace(size_t size) {
   // any allocations.  Otherwise start at the beginning of memory.
 
   if (m_allocations.empty()) {
-    ret = 0x0;
+    ret = 0;
   } else {
     auto back = m_allocations.rbegin();
     lldb::addr_t addr = back->first;
@@ -116,10 +116,18 @@ lldb::addr_t IRMemoryMap::FindSpace(size_t size) {
     Status err = process_sp->GetMemoryRegionInfo(ret, region_info);
     if (err.Success()) {
       while (true) {
-        if (region_info.GetReadable() != MemoryRegionInfo::OptionalBool::eNo ||
-            region_info.GetWritable() != MemoryRegionInfo::OptionalBool::eNo ||
-            region_info.GetExecutable() !=
-                MemoryRegionInfo::OptionalBool::eNo) {
+        if (region_info.GetRange().GetRangeBase() == 0 &&
+            region_info.GetRange().GetRangeEnd() < end_of_memory) {
+          // Don't use a region that starts at address 0,
+          // it can make it harder to debug null dereference crashes
+          // in the inferior.
+          ret = region_info.GetRange().GetRangeEnd();
+        } else if (region_info.GetReadable() !=
+                       MemoryRegionInfo::OptionalBool::eNo ||
+                   region_info.GetWritable() !=
+                       MemoryRegionInfo::OptionalBool::eNo ||
+                   region_info.GetExecutable() !=
+                       MemoryRegionInfo::OptionalBool::eNo) {
           if (region_info.GetRange().GetRangeEnd() - 1 >= end_of_memory) {
             ret = LLDB_INVALID_ADDRESS;
             break;

>From ac1a1e5797388598201511d17f05aa088ef4a2e2 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl at google.com>
Date: Wed, 24 Jul 2024 18:23:14 -0700
Subject: [PATCH 085/259] [ThinLTO][TypeProf] Import local-linkage global var
 for mod1:func_foo-> mod2:local-var edge (#100448)

VTable value profiling can create reference edges from `mod1:func_foo`
to `mod2:local-vtable`. Indirect call profiling can create reference
edges from `mod1:func_foo` to `mod2:local_func_bar`.

Given a ref chain `mod1:func_foo -> mod2:local-var`,`local-var` doesn't
get imported by default.

The compiler requires that the module of `local-var` be the same as the
module of the function that references it (`mod1:func_foo`). This prevents
mis-compilation when both `mod1` and `mod2` have a `local-var` of the same
name and the cpp files are compiled without full paths.

This patch allows the import when one of the following conditions holds:
1) The new option `import-assume-unique-local` is set. Compiler users can
set this option when they can guarantee that all files are compiled with
full paths.
2) There is only one instance of the value summary.

Test:
* A/B testing this option alone gives a statistically consistent -0.16%
cpu-cycle reduction on one search workload (no throughput increase).
* Testing it together with the existing more-efficient ICP raises the
throughput gain by a small margin (0.05%~0.1%).
* No regressions observed.
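
To make the naming collision concrete, here is a minimal sketch
(hypothetical files, not part of this patch) of two translation units
whose local-linkage variables share an index entry when both are built
as plain `foo.cpp` from different directories:

    // dir1/foo.cpp
    namespace {
    int local_var = 1;  // internal linkage
    }
    int get1() { return local_var; }

    // dir2/foo.cpp
    namespace {
    int local_var = 2;  // same local name, different definition
    }
    int get2() { return local_var; }

Without full paths, both locals end up with the same source-file-based
name and hence a colliding GUID, so a cross-module import could pick the
wrong copy. Building every file with its full path (and passing
`-import-assume-unique-local`, e.g. via `-Wl,-plugin-opt,...` as in the
updated test) makes the names unique and the import safe.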
---
 .../Linux/instrprof-vtable-value-prof.cpp     | 65 ++++++++++-------
 llvm/lib/Transforms/IPO/FunctionImport.cpp    | 71 ++++++++++++-------
 2 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 411640fc5176d..214c1504db7f4 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -5,9 +5,9 @@
 // ld.lld: error: /lib/../lib64/Scrt1.o: ABI version 1 is not supported
 // UNSUPPORTED: ppc && host-byteorder-big-endian
 
-// RUN: rm -rf %t && mkdir %t && cd %t
+// RUN: rm -rf %t && mkdir %t && split-file %s %t && cd %t
 
-// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o test
+// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling lib.cpp main.cpp -o test
 // RUN: env LLVM_PROFILE_FILE=test.profraw ./test
 
 // Show vtable profiles from raw profile.
@@ -37,23 +37,23 @@
 // COMMON-NEXT:  Number of instrumented vtables: 2
 // RAW:  Indirect Target Results:
 // RAW-NEXT:       [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// RAW-NEXT:       [  0, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
 // RAW-NEXT:       [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
-// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
+// RAW-NEXT:       [  1, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
 // RAW-NEXT:  VTable Results:
 // RAW-NEXT:       [  0, _ZTV8Derived1,        50 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
+// RAW-NEXT:       [  0, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
 // RAW-NEXT:       [  1, _ZTV8Derived1,        250 ] (25.00%)
-// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+// RAW-NEXT:       [  1, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED:     Indirect Target Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
 // INDEXED-NEXT:         [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
-// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
+// INDEXED-NEXT:         [  1, {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
 // INDEXED-NEXT:         [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
 // INDEXED-NEXT:     VTable Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
 // INDEXED-NEXT:         [  0, _ZTV8Derived1,        50 ] (25.00%)
-// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+// INDEXED-NEXT:         [  1, {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED-NEXT:         [  1, _ZTV8Derived1,        250 ] (25.00%)
 // COMMON: Instrumentation level: IR  entry_first = 0
 // COMMON-NEXT: Functions shown: 1
@@ -93,27 +93,27 @@
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150
+// ICTEXT: {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150
 // ICTEXT: _ZN8Derived14funcEii:50
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750
+// ICTEXT: {{.*}}lib.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750
 // ICTEXT: _ZN8Derived1D0Ev:250
 // ICTEXT: # ValueKind = IPVK_VTableTarget:
 // ICTEXT: 2
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150
+// ICTEXT: {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150
 // ICTEXT: _ZTV8Derived1:50
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
+// ICTEXT: {{.*}}lib.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
 // ICTEXT: _ZTV8Derived1:250
 
 // When vtable value profiles exist, pgo-instr-use pass should annotate them
 // even if `-enable-vtable-value-profiling` is not explicitly on.
 // RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
 // RUN:   -mllvm -print-after=pgo-instr-use -mllvm -filter-print-funcs=main \
-// RUN:   -mllvm -print-module-scope %s 2>&1 | FileCheck %s --check-prefix=ANNOTATE
+// RUN:   -mllvm -print-module-scope lib.cpp main.cpp 2>&1 | FileCheck %s --check-prefix=ANNOTATE
 
 // ANNOTATE-NOT: Inconsistent number of value sites
 // ANNOTATE: !{!"VP", i32 2
@@ -122,7 +122,7 @@
 // if `-icp-max-num-vtables` is set to zero.
 // RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
 // RUN:   -mllvm -icp-max-num-vtables=0 -mllvm -print-after=pgo-instr-use \
-// RUN:   -mllvm -filter-print-funcs=main -mllvm -print-module-scope %s 2>&1 | \
+// RUN:   -mllvm -filter-print-funcs=main -mllvm -print-module-scope lib.cpp main.cpp 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=OMIT
 
 // OMIT: Inconsistent number of value sites
@@ -141,28 +141,29 @@
 // RUN:    -g -flto=thin -fwhole-program-vtables -fno-split-lto-unit -O2 \
 // RUN:    -mllvm -enable-vtable-value-profiling -Wl,-mllvm,-enable-vtable-value-profiling \
 // RUN:    -mllvm -enable-vtable-profile-use \
+// RUN:    -Wl,-plugin-opt,-import-assume-unique-local \
 // RUN:    -Wl,-mllvm,-enable-vtable-profile-use -Rpass=pgo-icall-prom \
 // RUN:    -Wl,-mllvm,-print-after=pgo-icall-prom \
-// RUN:    -Wl,-mllvm,-filter-print-funcs=main %s 2>&1 \
+// RUN:    -Wl,-mllvm,-filter-print-funcs=main lib.cpp main.cpp 2>&1 \
 // RUN:    | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
 // For the indirect call site `ptr->func`
-// REMARK: instrprof-vtable-value-prof.cpp:226:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:226:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: main.cpp:10:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii.llvm.{{.*}} with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}}
+// REMARK: main.cpp:10:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 //
 // For the indirect call site `delete ptr`
-// REMARK: instrprof-vtable-value-prof.cpp:228:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:228:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: main.cpp:12:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev.llvm.{{.*}} with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}}
+// REMARK: main.cpp:12:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 
 // The IR matchers for indirect callsite `ptr->func`.
 // IR-LABEL: @main
 // IR:   [[OBJ:%.*]] = {{.*}}call {{.*}} @_Z10createTypei
 // IR:   [[VTABLE:%.*]] = load ptr, ptr [[OBJ]]
-// IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E, i32 16)
+// IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E.llvm.{{.*}}, i32 16)
 // IR:   br i1 [[CMP1]], label %[[BB1:.*]], label %[[BB2:[a-zA-Z0-9_.]+]],
 //
 // IR: [[BB1]]:
-// IR:   [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii
+// IR:   [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii.llvm.{{.*}}
 // IR:   br label %[[MERGE0:[a-zA-Z0-9_.]+]]
 //
 // IR: [[BB2]]:
@@ -185,6 +186,7 @@
 // IR: [[MERGE0]]:
 // IR:    [[RES2:%.*]] = phi i32 [ [[RES1]], %[[MERGE1]] ], [ [[RESBB1]], %[[BB1]] ]
 
+//--- lib.h
 #include <stdio.h>
 #include <stdlib.h>
 class Base {
@@ -193,12 +195,19 @@ class Base {
 
   virtual ~Base() {};
 };
+
 class Derived1 : public Base {
 public:
-  int func(int a, int b) override { return a * b; }
+  int func(int a, int b) override;
 
   ~Derived1() {}
 };
+
+__attribute__((noinline)) Base *createType(int a);
+
+//--- lib.cpp
+#include "lib.h"
+
 namespace {
 class Derived2 : public Base {
 public:
@@ -207,7 +216,10 @@ class Derived2 : public Base {
   ~Derived2() {}
 };
 } // namespace
-__attribute__((noinline)) Base *createType(int a) {
+
+int Derived1::func(int a, int b) { return a * b; }
+
+Base *createType(int a) {
   Base *base = nullptr;
   if (a % 4 == 0)
     base = new Derived1();
@@ -216,6 +228,9 @@ __attribute__((noinline)) Base *createType(int a) {
   return base;
 }
 
+//--- main.cpp
+#include "lib.h"
+
 int main(int argc, char **argv) {
   int sum = 0;
   for (int i = 0; i < 1000; i++) {
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 038785114a0cf..547433c280869 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -174,6 +174,17 @@ static cl::opt<std::string> WorkloadDefinitions(
              "}"),
     cl::Hidden);
 
+static cl::opt<bool> ImportAssumeUniqueLocal(
+    "import-assume-unique-local", cl::init(false),
+    cl::desc(
+        "By default, a local-linkage global variable won't be imported in the "
+        "edge mod1:func -> mod2:local-var (from value profiles) since compiler "
+        "cannot assume mod2 is compiled with full path which gives local-var a "
+        "program-wide unique GUID. Set this option to true will help cross "
+        "module import of such variables. This is only safe if the compiler "
+        "user specify the full module path."),
+    cl::Hidden);
+
 namespace llvm {
 extern cl::opt<bool> EnableMemProfContextDisambiguation;
 }
@@ -196,6 +207,23 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
   return Result;
 }
 
+static bool shouldSkipLocalInAnotherModule(const GlobalVarSummary *RefSummary,
+                                           size_t NumDefs,
+                                           StringRef ImporterModule) {
+  // We can import a local from another module if all inputs are compiled
+  // with full paths or when there is one definition.
+  if (ImportAssumeUniqueLocal || NumDefs == 1)
+    return false;
+  // In other cases, make sure we import the copy in the caller's module if the
+  // referenced value has local linkage. The only time a local variable can
+  // share an entry in the index is if there is a local with the same name in
+  // another module that had the same source file name (in a different
+  // directory), where each was compiled in their own directory so there was not
+  // distinguishing path.
+  return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
+         RefSummary->modulePath() != ImporterModule;
+}
+
 /// Given a list of possible callee implementation for a call site, qualify the
 /// legality of importing each. The return is a range of pairs. Each pair
 /// corresponds to a candidate. The first value is the ImportFailureReason for
@@ -228,19 +256,21 @@ static auto qualifyCalleeCandidates(
         if (!Summary)
           return {FunctionImporter::ImportFailureReason::GlobalVar, GVSummary};
 
-        // If this is a local function, make sure we import the copy
-        // in the caller's module. The only time a local function can
-        // share an entry in the index is if there is a local with the same name
-        // in another module that had the same source file name (in a different
-        // directory), where each was compiled in their own directory so there
-        // was not distinguishing path.
-        // However, do the import from another module if there is only one
-        // entry in the list - in that case this must be a reference due
-        // to indirect call profile data, since a function pointer can point to
-        // a local in another module.
-        if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
-            CalleeSummaryList.size() > 1 &&
-            Summary->modulePath() != CallerModulePath)
+        // If this is a local function, make sure we import the copy in the
+        // caller's module. The only time a local function can share an entry in
+        // the index is if there is a local with the same name in another module
+        // that had the same source file name (in a different directory), where
+        // each was compiled in their own directory so there was not
+        // distinguishing path.
+        // If the local function is from another module, it must be a reference
+        // due to indirect call profile data since a function pointer can point
+        // to a local in another module. Do the import from another module if
+        // there is only one entry in the list or when all files in the program
+        // are compiled with full path - in both cases the local function has
+        // unique PGO name and GUID.
+        if (shouldSkipLocalInAnotherModule(dyn_cast<GlobalVarSummary>(Summary),
+                                           CalleeSummaryList.size(),
+                                           CallerModulePath))
           return {
               FunctionImporter::ImportFailureReason::LocalLinkageNotInModule,
               GVSummary};
@@ -359,18 +389,6 @@ class GlobalsImporter final {
 
       LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
 
-      // If this is a local variable, make sure we import the copy
-      // in the caller's module. The only time a local variable can
-      // share an entry in the index is if there is a local with the same name
-      // in another module that had the same source file name (in a different
-      // directory), where each was compiled in their own directory so there
-      // was not distinguishing path.
-      auto LocalNotInModule =
-          [&](const GlobalValueSummary *RefSummary) -> bool {
-        return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
-               RefSummary->modulePath() != Summary.modulePath();
-      };
-
       for (const auto &RefSummary : VI.getSummaryList()) {
         const auto *GVS = dyn_cast<GlobalVarSummary>(RefSummary.get());
         // Functions could be referenced by global vars - e.g. a vtable; but we
@@ -379,7 +397,8 @@ class GlobalsImporter final {
         // based on profile information). Should we decide to handle them here,
         // we can refactor accordingly at that time.
         if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) ||
-            LocalNotInModule(GVS))
+            shouldSkipLocalInAnotherModule(GVS, VI.getSummaryList().size(),
+                                           Summary.modulePath()))
           continue;
 
         // If there isn't an entry for GUID, insert <GUID, Definition> pair.

>From 8e43acbfedf53ded43ec693ddaaf518cb7416c1c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 24 Jul 2024 21:06:19 -0500
Subject: [PATCH 086/259] [libc] Only add '-fno-builtin-*' on the entrypoints
 that use them (#100481)

Summary:
The GPU build needs to be able to inline functions during LTO. Builtin
transformations cause problems for the functions on which the optimizer
performs heavy libcall recognition. Previously we moved to using
`-fno-builtin-*` so that only the problematic builtins are disabled.
However, this still didn't allow inlining, because each function carried
the attribute that tells the inliner not to inline a nobuiltin function
into a non-nobuiltin function.

This patch fixes that by only applying these attributes to the
entrypoints that define them. That is enough to prevent recursive calls
within the definitions themselves.
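
For context, a minimal sketch (illustrative source, not the actual libc
implementation) of why the entrypoint that implements a builtin needs
`-fno-builtin-<name>`:

    // Built without -fno-builtin-memcpy, an optimizer that recognizes
    // this loop as a memcpy idiom may replace it with a call to memcpy,
    // i.e. the function ends up calling itself.
    extern "C" void *memcpy(void *dst, const void *src, unsigned long n) {
      auto *d = static_cast<char *>(dst);
      auto *s = static_cast<const char *>(src);
      for (unsigned long i = 0; i < n; ++i)
        d[i] = s[i];
      return dst;
    }

With this change the `-fno-builtin-memcpy` flag is attached only to the
`memcpy` entrypoint target rather than to every libc object, so other
functions no longer carry the attribute that blocks inlining during LTO.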
---
 libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 11 +----------
 libc/cmake/modules/LLVMLibCObjectRules.cmake        | 11 +++++++++++
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 0f1ef6a575277..9fc10375a1d37 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -115,16 +115,7 @@ function(_get_common_compile_options output_var flags)
       list(APPEND compile_options "-ffixed-point")
     endif()
 
-    # Builtin recognition causes issues when trying to implement the builtin
-    # functions themselves. The GPU backends do not use libcalls so we disable
-    # the known problematic ones. This allows inlining during LTO linking.
-    if(LIBC_TARGET_OS_IS_GPU)
-      set(libc_builtins bcmp strlen memmem bzero memcmp memcpy memmem memmove
-                        memset strcmp strstr)
-      foreach(builtin ${libc_builtins})
-        list(APPEND compile_options "-fno-builtin-${builtin}")
-      endforeach()
-    else()
+    if(NOT LIBC_TARGET_OS_IS_GPU)
       list(APPEND compile_options "-fno-builtin")
     endif()
 
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 2d3db38ecd8a3..68b5ed1ed51c0 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -279,6 +279,17 @@ function(create_entrypoint_object fq_target_name)
   add_dependencies(${fq_target_name} ${full_deps_list})
   target_link_libraries(${fq_target_name} ${full_deps_list})
 
+  # Builtin recognition causes issues when trying to implement the builtin
+  # functions themselves. The GPU backends do not use libcalls so we disable the
+  # known problematic ones on the entrypoints that implement them.
+  if(LIBC_TARGET_OS_IS_GPU)
+    set(libc_builtins bcmp strlen memmem bzero memcmp memcpy memmem memmove
+                      memset strcmp strstr)
+    if(${ADD_ENTRYPOINT_OBJ_NAME} IN_LIST libc_builtins)
+      target_compile_options(${fq_target_name} PRIVATE -fno-builtin-${ADD_ENTRYPOINT_OBJ_NAME})
+    endif()
+  endif()
+
   set_target_properties(
     ${fq_target_name}
     PROPERTIES

>From 7e7a9069d4240d2ae619cb50eba09f948c537ce3 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano at gmail.com>
Date: Wed, 24 Jul 2024 19:22:18 -0700
Subject: [PATCH 087/259] Revert "[clang-format] Fix a bug in annotating `*` in
 `#define`s (#99433)"

This reverts commit ce1a87437cc143889665c41046107e84cdf6246e.

Closes #100304.
---
 clang/lib/Format/TokenAnnotator.cpp           | 19 +++++-------------
 clang/unittests/Format/TokenAnnotatorTest.cpp | 20 -------------------
 2 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 21924a8fe17d1..5c11f3cb1a874 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -372,6 +372,10 @@ class AnnotatingParser {
                OpeningParen.Previous->is(tok::kw__Generic)) {
       Contexts.back().ContextType = Context::C11GenericSelection;
       Contexts.back().IsExpression = true;
+    } else if (Line.InPPDirective &&
+               (!OpeningParen.Previous ||
+                OpeningParen.Previous->isNot(tok::identifier))) {
+      Contexts.back().IsExpression = true;
     } else if (Contexts[Contexts.size() - 2].CaretFound) {
       // This is the parameter list of an ObjC block.
       Contexts.back().IsExpression = false;
@@ -384,20 +388,7 @@ class AnnotatingParser {
                OpeningParen.Previous->MatchingParen->isOneOf(
                    TT_ObjCBlockLParen, TT_FunctionTypeLParen)) {
       Contexts.back().IsExpression = false;
-    } else if (Line.InPPDirective) {
-      auto IsExpr = [&OpeningParen] {
-        const auto *Tok = OpeningParen.Previous;
-        if (!Tok || Tok->isNot(tok::identifier))
-          return true;
-        Tok = Tok->Previous;
-        while (Tok && Tok->endsSequence(tok::coloncolon, tok::identifier)) {
-          assert(Tok->Previous);
-          Tok = Tok->Previous->Previous;
-        }
-        return !Tok || !Tok->Tok.getIdentifierInfo();
-      };
-      Contexts.back().IsExpression = IsExpr();
-    } else if (!Line.MustBeDeclaration) {
+    } else if (!Line.MustBeDeclaration && !Line.InPPDirective) {
       bool IsForOrCatch =
           OpeningParen.Previous &&
           OpeningParen.Previous->isOneOf(tok::kw_for, tok::kw_catch);
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index b01ca322505b1..51810ad047a26 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -75,26 +75,6 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) {
   EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_TypeDeclarationParen);
   EXPECT_TOKEN(Tokens[11], tok::star, TT_PointerOrReference);
 
-  Tokens = annotate("#define FOO bar(a * b)");
-  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
-  EXPECT_TOKEN(Tokens[6], tok::star, TT_BinaryOperator);
-
-  Tokens = annotate("#define FOO foo.bar(a & b)");
-  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
-  EXPECT_TOKEN(Tokens[8], tok::amp, TT_BinaryOperator);
-
-  Tokens = annotate("#define FOO foo::bar(a && b)");
-  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
-  EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_BinaryOperator);
-
-  Tokens = annotate("#define FOO foo bar(a *b)");
-  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
-  EXPECT_TOKEN(Tokens[7], tok::star, TT_PointerOrReference);
-
-  Tokens = annotate("#define FOO void foo::bar(a &b)");
-  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
-  EXPECT_TOKEN(Tokens[9], tok::amp, TT_PointerOrReference);
-
   Tokens = annotate("void f() {\n"
                     "  while (p < a && *p == 'a')\n"
                     "    p++;\n"

>From ccae7b461be339e717d02f99ac857cf0bc7d17fc Mon Sep 17 00:00:00 2001
From: Gedare Bloom <gedare at rtems.org>
Date: Wed, 24 Jul 2024 20:47:11 -0600
Subject: [PATCH 088/259] [clang-format] Improve BlockIndent at ColumnLimit
 (#93140)

Fixes #55731

The reported formatting problems were related to ignoring deep nesting
of "simple" functions (causing #54808) and to allowing the trailing
annotation to become separated from the closing parens, which allowed a
break to occur between the closing parens and the trailing annotation.
The fix for the nesting of "simple" functions is to detect them more
carefully. "Simple" was defined in a comment as a call with a single
non-expression argument. I tried to stay as close as possible to the
original intent of the implementation while fixing the various bad
formatting reports.

In the process of fixing these bugs, some latent bugs were discovered
related to how JavaScript Template Strings are handled. Those are also
fixed here.

---------

Co-authored-by: Owen Pan <owenpiano at gmail.com>
---
 clang/lib/Format/ContinuationIndenter.cpp     | 38 +++++++++++++++++--
 clang/lib/Format/FormatToken.h                |  1 +
 clang/lib/Format/TokenAnnotator.cpp           |  8 ++++
 clang/unittests/Format/FormatTest.cpp         | 25 ++++++++++++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 32 ++++++++++++++++
 5 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index b07360425ca6e..df86a774ba0f4 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -803,6 +803,37 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
     return !Tok.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
                                   tok::kw_switch);
   };
+  auto IsFunctionCallParen = [](const FormatToken &Tok) {
+    return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
+           Tok.Previous->is(tok::identifier);
+  };
+  const auto IsInTemplateString = [this](const FormatToken &Tok) {
+    if (!Style.isJavaScript())
+      return false;
+    for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) {
+      if (Prev->is(TT_TemplateString) && Prev->opensScope())
+        return true;
+      if (Prev->is(TT_TemplateString) && Prev->closesScope())
+        break;
+    }
+    return false;
+  };
+  // Identifies simple (no expression) one-argument function calls.
+  const auto IsSimpleFunction = [&](const FormatToken &Tok) {
+    if (!Tok.FakeLParens.empty() && Tok.FakeLParens.back() > prec::Unknown)
+      return false;
+    const auto *Previous = Tok.Previous;
+    if (!Previous || (!Previous->isOneOf(TT_FunctionDeclarationLParen,
+                                         TT_LambdaDefinitionLParen) &&
+                      !IsFunctionCallParen(*Previous))) {
+      return true;
+    }
+    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok))
+      return true;
+    const auto *Next = Tok.Next;
+    return !Next || Next->isMemberAccess() ||
+           Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next);
+  };
   if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak ||
        Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) &&
       IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
@@ -813,10 +844,10 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       //       caaaaaaaaaaaall(
       //           caaaaaaaaaaaall(
       //               caaaaaaaaaaaaaaaaaaaaaaall(aaaaaaaaaaaaaa, aaaaaaaaa))));
-      Current.FakeLParens.size() > 0 &&
-      Current.FakeLParens.back() > prec::Unknown) {
+      !IsSimpleFunction(Current)) {
     CurrentState.NoLineBreak = true;
   }
+
   if (Previous.is(TT_TemplateString) && Previous.opensScope())
     CurrentState.NoLineBreak = true;
 
@@ -831,7 +862,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       Previous.isNot(TT_TableGenDAGArgOpenerToBreak) &&
       !(Current.MacroParent && Previous.MacroParent) &&
       (Current.isNot(TT_LineComment) ||
-       Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen))) {
+       Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) &&
+      !IsInTemplateString(Current)) {
     CurrentState.Indent = State.Column + Spaces;
     CurrentState.IsAligned = true;
   }
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index cc45d5a8c5c1e..abcedb66b57cc 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -102,6 +102,7 @@ namespace format {
   TYPE(JsTypeColon)                                                            \
   TYPE(JsTypeOperator)                                                         \
   TYPE(JsTypeOptionalQuestion)                                                 \
+  TYPE(LambdaDefinitionLParen)                                                 \
   TYPE(LambdaLBrace)                                                           \
   TYPE(LambdaLSquare)                                                          \
   TYPE(LeadingJavaAnnotation)                                                  \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 5c11f3cb1a874..16ab18e1af959 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -62,6 +62,7 @@ static bool canBeObjCSelectorComponent(const FormatToken &Tok) {
 
 /// With `Left` being '(', check if we're at either `[...](` or
 /// `[...]<...>(`, where the [ opens a lambda capture list.
+// FIXME: this doesn't cover attributes/constraints before the l_paren.
 static bool isLambdaParameterList(const FormatToken *Left) {
   // Skip <...> if present.
   if (Left->Previous && Left->Previous->is(tok::greater) &&
@@ -365,6 +366,7 @@ class AnnotatingParser {
       Contexts.back().IsExpression = false;
     } else if (isLambdaParameterList(&OpeningParen)) {
       // This is a parameter list of a lambda expression.
+      OpeningParen.setType(TT_LambdaDefinitionLParen);
       Contexts.back().IsExpression = false;
     } else if (OpeningParen.is(TT_RequiresExpressionLParen)) {
       Contexts.back().IsExpression = false;
@@ -6196,6 +6198,12 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
     return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf()));
   }
 
+  if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) &&
+      Right.is(TT_TrailingAnnotation) &&
+      Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) {
+    return false;
+  }
+
   // Allow breaking after a trailing annotation, e.g. after a method
   // declaration.
   if (Left.is(TT_TrailingAnnotation)) {
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 39fcbab3447a7..d7f81813835fa 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -9338,6 +9338,31 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) {
       "    aaaaaaaaaaaaaaaa\n"
       ");",
       Style);
+  verifyFormat("bool aaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
+               "    const bool &aaaaaaaaa, const void *aaaaaaaaaa\n"
+               ") const {\n"
+               "  return true;\n"
+               "}",
+               Style);
+  verifyFormat("bool aaaaaaaaaaaaaaaaaaaaaaaa(\n"
+               "    const bool &aaaaaaaaaa, const void *aaaaaaaaaa\n"
+               ") const;",
+               Style);
+  verifyFormat("void aaaaaaaaa(\n"
+               "    int aaaaaa, int bbbbbb, int cccccc, int dddddddddd\n"
+               ") const noexcept -> std::vector<of_very_long_type>;",
+               Style);
+  verifyFormat(
+      "x = aaaaaaaaaaaaaaa(\n"
+      "    \"a aaaaaaa aaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaa aaaaaaaaaaaaa\"\n"
+      ");",
+      Style);
+  Style.ColumnLimit = 60;
+  verifyFormat("auto lambda =\n"
+               "    [&b](\n"
+               "        auto aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+               "    ) {};",
+               Style);
 }
 
 TEST_F(FormatTest, ParenthesesAndOperandAlignment) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 51810ad047a26..1f820d83c261a 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1645,38 +1645,45 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   auto Tokens = annotate("[]() constexpr {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() consteval {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() mutable {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() static {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[5], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto {}");
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto & {}");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[]() -> auto * {}");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[4], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_LambdaLBrace);
 
@@ -1705,6 +1712,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   Tokens = annotate("foo([&](u32 bar) __attribute__((attr)) -> void {});");
   ASSERT_EQ(Tokens.size(), 22u) << Tokens;
   EXPECT_TOKEN(Tokens[2], tok::l_square, TT_LambdaLSquare);
+  EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[17], tok::l_brace, TT_LambdaLBrace);
 
@@ -1712,6 +1720,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> {}");
@@ -1724,6 +1733,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename... T> {}");
@@ -1736,6 +1746,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <int... T> {}");
@@ -1748,6 +1759,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <Foo... T> {}");
@@ -1761,6 +1773,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[14]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
@@ -1769,6 +1782,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 19u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[11], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[15]->ClosesRequiresClause);
@@ -1778,6 +1792,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 23u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[19]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
@@ -1786,6 +1801,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   ASSERT_EQ(Tokens.size(), 20u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::arrow, TT_TrailingReturnArrow);
   EXPECT_TOKEN(Tokens[12], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[16]->ClosesRequiresClause);
@@ -1797,6 +1813,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> requires Bar<T> (T &&t) {}");
@@ -1805,6 +1823,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[13], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[16], tok::l_brace, TT_LambdaLBrace);
 
@@ -1814,6 +1834,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[15]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[16], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <typename T> requires true (T&& t) {}");
@@ -1822,6 +1844,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[7]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[10], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_LambdaLBrace);
 
@@ -1856,6 +1880,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[10]->ClosesRequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[11], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[15], tok::kw_requires, TT_RequiresClause);
   EXPECT_TRUE(Tokens[19]->ClosesRequiresClause);
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_LambdaLBrace);
@@ -1865,6 +1891,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <int I = 0> (T t) {}");
@@ -1872,6 +1899,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <bool b = false> (T t) {}");
@@ -1879,6 +1907,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[0], tok::l_square, TT_LambdaLSquare);
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_LambdaLBrace);
 
   Tokens = annotate("[] <bool b = true && false> (T&& t) {}");
@@ -1887,6 +1916,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::ampamp, TT_BinaryOperator);
   EXPECT_TOKEN(Tokens[9], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[10], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[12], tok::ampamp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_LambdaLBrace);
 
@@ -1896,6 +1926,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
   EXPECT_TOKEN(Tokens[8], tok::kw_requires, TT_RequiresClause);
+  // FIXME:
+  // EXPECT_TOKEN(Tokens[13], tok::l_paren, TT_LambdaDefinitionLParen);
   EXPECT_TOKEN(Tokens[17], tok::l_brace, TT_LambdaLBrace);
 }
 

>From ba8883c46e3cb1782f66774a116bc0364ea68ce0 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl at google.com>
Date: Wed, 24 Jul 2024 21:27:46 -0700
Subject: [PATCH 089/259] Fix buildbot failure by fixing the base pointer type
 (#100508)

This should fix buildbot failures like
https://lab.llvm.org/buildbot/#/builders/169/builds/1448
---
 llvm/lib/Transforms/IPO/FunctionImport.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 547433c280869..210b79e3a0adc 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -207,7 +207,7 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
   return Result;
 }
 
-static bool shouldSkipLocalInAnotherModule(const GlobalVarSummary *RefSummary,
+static bool shouldSkipLocalInAnotherModule(const GlobalValueSummary *RefSummary,
                                            size_t NumDefs,
                                            StringRef ImporterModule) {
   // We can import a local from another module if all inputs are compiled
@@ -268,8 +268,7 @@ static auto qualifyCalleeCandidates(
         // there is only one entry in the list or when all files in the program
         // are compiled with full path - in both cases the local function has
         // unique PGO name and GUID.
-        if (shouldSkipLocalInAnotherModule(dyn_cast<GlobalVarSummary>(Summary),
-                                           CalleeSummaryList.size(),
+        if (shouldSkipLocalInAnotherModule(Summary, CalleeSummaryList.size(),
                                            CallerModulePath))
           return {
               FunctionImporter::ImportFailureReason::LocalLinkageNotInModule,

>From 8608cc1c89640bd3d8120f24c964af21310253b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Thu, 25 Jul 2024 06:44:41 +0200
Subject: [PATCH 090/259] [clang][Interp] Fix array element This chains

The previous test was too minimal. If we actually do something after
initializing the nested array element, we end up causing a stack element
type mismatch.
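
As a hypothetical reduction (the struct layout below is guessed from the asserts in the
new test; the committed records.cpp hunk is the authoritative reproducer), the failing
shape looks like this:

  struct Str {
    const char *S = nullptr;
  };
  struct CustomOperandVal {
    Str Name;      // nested record, initialized first
    int Width = 0; // trailing fields force more work on the same element
    int Mask = 1;
  };
  constexpr CustomOperandVal A[] = {
      {},
      {{"depctr_hold_cnt"}, 12, 13},
  };
  static_assert(A[1].Width == 12, "");
  static_assert(A[1].Mask == 13, "");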
---
 clang/lib/AST/Interp/Compiler.cpp | 5 +++--
 clang/test/AST/Interp/records.cpp | 4 ++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 4d5d725d6964c..df55d01b8b9d6 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -92,7 +92,7 @@ bool InitLink::emit(Compiler<Emitter> *Ctx, const Expr *E) const {
   case K_Elem:
     if (!Ctx->emitConstUint32(Offset, E))
       return false;
-    return Ctx->emitArrayElemPtrUint32(E);
+    return Ctx->emitArrayElemPtrPopUint32(E);
   default:
     llvm_unreachable("Unhandled InitLink kind");
   }
@@ -4156,7 +4156,8 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   if (InitStackActive && !InitStack.empty()) {
     unsigned StartIndex = 0;
     for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
-      if (InitStack[StartIndex].Kind != InitLink::K_Field)
+      if (InitStack[StartIndex].Kind != InitLink::K_Field &&
+          InitStack[StartIndex].Kind != InitLink::K_Elem)
         break;
     }
 
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index d77e5a5c782ce..9551630caf3d6 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1551,8 +1551,12 @@ namespace ArrayInitChain {
 
   constexpr CustomOperandVal A[] = {
     {},
+    {{"depctr_hold_cnt"},  12,   13},
   };
   static_assert(A[0].Str.S == nullptr, "");
   static_assert(A[0].Width == 0, "");
   static_assert(A[0].Mask == 1, "");
+
+  static_assert(A[1].Width == 12, "");
+  static_assert(A[1].Mask == 13, "");
 }

>From 25482b356e51de1f259b3ac7c785ab34977781b4 Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz at cn.ibm.com>
Date: Thu, 25 Jul 2024 13:46:59 +0800
Subject: [PATCH 091/259] [PowerPC] add TargetParser for PPC target (#97541)

For now this only covers the CPU type; the CPU features part will be
handled later.

With the CPU handling in TargetParser, clang and llc/opt are able to
query common interfaces, so they can agree on the same default CPU and
CPU features through the same interfaces.
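
As a rough sketch (not part of this patch), a front end could resolve an
-mcpu= value through the new common interface like this; the llvm::PPC entry
points are the ones declared in the new header below, while pickPPCTargetCPU
and its error handling are made up for illustration:

  #include "llvm/TargetParser/PPCTargetParser.h"
  #include "llvm/TargetParser/Triple.h"
  #include <string>

  // Resolve a -mcpu= request to the normalized backend CPU name, or return
  // "" so the caller can emit an "unknown target CPU" diagnostic.
  static std::string pickPPCTargetCPU(const llvm::Triple &T,
                                      llvm::StringRef RequestedCPU) {
    // "native" is resolved inside getNormalizedPPCTargetCPU via
    // sys::getHostCPUName(), so only validate explicit CPU names here.
    if (!RequestedCPU.empty() && RequestedCPU != "native" &&
        !llvm::PPC::isValidCPU(RequestedCPU))
      return "";
    // "power10" and "pwr10" normalize to the same name; an empty or
    // "generic" request falls back to the per-triple default (pwr7 on AIX,
    // ppc64le/ppc64/ppc elsewhere).
    return llvm::PPC::getNormalizedPPCTargetCPU(T, RequestedCPU).str();
  }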
---
 clang/lib/Basic/Targets/PPC.cpp               |  18 +--
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      |  73 -----------
 clang/lib/Driver/ToolChains/Arch/PPC.h        |   4 -
 clang/lib/Driver/ToolChains/Clang.cpp         |   7 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |   6 +-
 clang/test/CodeGen/aix-builtin-cpu-is.c       |  42 +++---
 clang/test/CodeGen/builtin-cpu-supports.c     | 110 ++++++++++------
 clang/test/Misc/target-invalid-cpu-note.c     |   2 +-
 .../llvm/TargetParser/PPCTargetParser.def     |  62 ++++++++-
 .../llvm/TargetParser/PPCTargetParser.h       |  40 ++++++
 llvm/lib/TargetParser/CMakeLists.txt          |   1 +
 llvm/lib/TargetParser/PPCTargetParser.cpp     | 121 ++++++++++++++++++
 .../secondary/llvm/lib/TargetParser/BUILD.gn  |   1 +
 13 files changed, 324 insertions(+), 163 deletions(-)
 create mode 100644 llvm/include/llvm/TargetParser/PPCTargetParser.h
 create mode 100644 llvm/lib/TargetParser/PPCTargetParser.cpp

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 9ff54083c923b..d8203f76a5468 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -14,6 +14,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 
 using namespace clang;
 using namespace clang::targets;
@@ -882,25 +883,12 @@ ArrayRef<TargetInfo::AddlRegName> PPCTargetInfo::getGCCAddlRegNames() const {
   return llvm::ArrayRef(GCCAddlRegNames);
 }
 
-static constexpr llvm::StringLiteral ValidCPUNames[] = {
-    {"generic"},   {"440"},     {"450"},         {"601"},     {"602"},
-    {"603"},       {"603e"},    {"603ev"},       {"604"},     {"604e"},
-    {"620"},       {"630"},     {"g3"},          {"7400"},    {"g4"},
-    {"7450"},      {"g4+"},     {"750"},         {"8548"},    {"970"},
-    {"g5"},        {"a2"},      {"e500"},        {"e500mc"},  {"e5500"},
-    {"power3"},    {"pwr3"},    {"power4"},      {"pwr4"},    {"power5"},
-    {"pwr5"},      {"power5x"}, {"pwr5x"},       {"power6"},  {"pwr6"},
-    {"power6x"},   {"pwr6x"},   {"power7"},      {"pwr7"},    {"power8"},
-    {"pwr8"},      {"power9"},  {"pwr9"},        {"power10"}, {"pwr10"},
-    {"power11"},   {"pwr11"},   {"powerpc"},     {"ppc"},     {"ppc32"},
-    {"powerpc64"}, {"ppc64"},   {"powerpc64le"}, {"ppc64le"}, {"future"}};
-
 bool PPCTargetInfo::isValidCPUName(StringRef Name) const {
-  return llvm::is_contained(ValidCPUNames, Name);
+  return llvm::PPC::isValidCPU(Name);
 }
 
 void PPCTargetInfo::fillValidCPUList(SmallVectorImpl<StringRef> &Values) const {
-  Values.append(std::begin(ValidCPUNames), std::end(ValidCPUNames));
+  llvm::PPC::fillValidCPUList(Values);
 }
 
 void PPCTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index acd5757d6ea97..57baa186a9eb7 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -20,79 +20,6 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;
 
-static std::string getPPCGenericTargetCPU(const llvm::Triple &T) {
-  // LLVM may default to generating code for the native CPU,
-  // but, like gcc, we default to a more generic option for
-  // each architecture. (except on AIX)
-  if (T.isOSAIX())
-    return "pwr7";
-  else if (T.getArch() == llvm::Triple::ppc64le)
-    return "ppc64le";
-  else if (T.getArch() == llvm::Triple::ppc64)
-    return "ppc64";
-  else
-    return "ppc";
-}
-
-static std::string normalizeCPUName(StringRef CPUName, const llvm::Triple &T) {
-  // Clang/LLVM does not actually support code generation
-  // for the 405 CPU. However, there are uses of this CPU ID
-  // in projects that previously used GCC and rely on Clang
-  // accepting it. Clang has always ignored it and passed the
-  // generic CPU ID to the back end.
-  if (CPUName == "generic" || CPUName == "405")
-    return getPPCGenericTargetCPU(T);
-
-  if (CPUName == "native") {
-    std::string CPU = std::string(llvm::sys::getHostCPUName());
-    if (!CPU.empty() && CPU != "generic")
-      return CPU;
-    else
-      return getPPCGenericTargetCPU(T);
-  }
-
-  return llvm::StringSwitch<const char *>(CPUName)
-      .Case("common", "generic")
-      .Case("440fp", "440")
-      .Case("630", "pwr3")
-      .Case("G3", "g3")
-      .Case("G4", "g4")
-      .Case("G4+", "g4+")
-      .Case("8548", "e500")
-      .Case("G5", "g5")
-      .Case("power3", "pwr3")
-      .Case("power4", "pwr4")
-      .Case("power5", "pwr5")
-      .Case("power5x", "pwr5x")
-      .Case("power6", "pwr6")
-      .Case("power6x", "pwr6x")
-      .Case("power7", "pwr7")
-      .Case("power8", "pwr8")
-      .Case("power9", "pwr9")
-      .Case("power10", "pwr10")
-      .Case("power11", "pwr11")
-      .Case("future", "future")
-      .Case("powerpc", "ppc")
-      .Case("powerpc64", "ppc64")
-      .Case("powerpc64le", "ppc64le")
-      .Default(CPUName.data());
-}
-
-/// Get the (LLVM) name of the PowerPC cpu we are tuning for.
-std::string ppc::getPPCTuneCPU(const ArgList &Args, const llvm::Triple &T) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ))
-    return normalizeCPUName(A->getValue(), T);
-  return getPPCGenericTargetCPU(T);
-}
-
-/// Get the (LLVM) name of the PowerPC cpu we are targeting.
-std::string ppc::getPPCTargetCPU(const Driver &D, const ArgList &Args,
-                                 const llvm::Triple &T) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
-    return normalizeCPUName(A->getValue(), T);
-  return getPPCGenericTargetCPU(T);
-}
-
 const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
   return llvm::StringSwitch<const char *>(Name)
       .Case("pwr7", "-mpower7")
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.h b/clang/lib/Driver/ToolChains/Arch/PPC.h
index ec5b3c8140b66..89b9af92e8ddb 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.h
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.h
@@ -35,10 +35,6 @@ enum class ReadGOTPtrMode {
 
 FloatABI getPPCFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
 
-std::string getPPCTargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
-                            const llvm::Triple &T);
-std::string getPPCTuneCPU(const llvm::opt::ArgList &Args,
-                          const llvm::Triple &T);
 const char *getPPCAsmModeForCPU(StringRef Name);
 ReadGOTPtrMode getPPCReadGOTPtrMode(const Driver &D, const llvm::Triple &Triple,
                                     const llvm::opt::ArgList &Args);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d3edda3f10543..4a94df7d5f42e 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -60,6 +60,7 @@
 #include "llvm/TargetParser/ARMTargetParserCommon.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/LoongArchTargetParser.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/RISCVTargetParser.h"
 #include <cctype>
@@ -2026,10 +2027,10 @@ void Clang::AddPPCTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
   const llvm::Triple &T = getToolChain().getTriple();
-  if (Args.getLastArg(options::OPT_mtune_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
     CmdArgs.push_back("-tune-cpu");
-    std::string CPU = ppc::getPPCTuneCPU(Args, T);
-    CmdArgs.push_back(Args.MakeArgString(CPU));
+    StringRef CPU = llvm::PPC::getNormalizedPPCTuneCPU(T, A->getValue());
+    CmdArgs.push_back(Args.MakeArgString(CPU.str()));
   }
 
   // Select the ABI to use.
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 940ea3de492b1..1e37d9d348818 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/PPCTargetParser.h"
 #include "llvm/TargetParser/TargetParser.h"
 #include <optional>
 
@@ -634,7 +635,10 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
   case llvm::Triple::ppcle:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
-    return ppc::getPPCTargetCPU(D, Args, T);
+    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
+      return std::string(
+          llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
+    return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
 
   case llvm::Triple::csky:
     if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
diff --git a/clang/test/CodeGen/aix-builtin-cpu-is.c b/clang/test/CodeGen/aix-builtin-cpu-is.c
index 04644dd7020e0..83e8c99e0a78d 100644
--- a/clang/test/CodeGen/aix-builtin-cpu-is.c
+++ b/clang/test/CodeGen/aix-builtin-cpu-is.c
@@ -1,52 +1,60 @@
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"pwr7\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \
+// RUN:   --check-prefix=CHECKOP
+
+// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=65536 \
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=131072\
 // RUN:   --check-prefix=CHECKOP
 
-// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c 
+// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \
+// RUN:   --check-prefix=CHECKOP
+
+// RUN: echo "int main() { return __builtin_cpu_is(\"pwr10\");}" > %t.c
 // RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \
 // RUN:   --check-prefix=CHECKOP
 
@@ -67,7 +75,7 @@
 // CHECKOP-NEXT:   %retval = alloca i32, align 4
 // CHECKOP-NEXT:   store i32 0, ptr %retval, align 4
 // CHECKOP-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }, ptr @_system_configuration, i32 0, i32 1), align 4
-// CHECKOP-NEXT:   %1 = icmp eq i32 %0, [[VALUE]] 
+// CHECKOP-NEXT:   %1 = icmp eq i32 %0, [[VALUE]]
 // CHECKOP-NEXT:  %conv = zext i1 %1 to i32
 // CHECKOP-NEXT:   ret i32 %conv
 // CHECKOP-NEXT: }
diff --git a/clang/test/CodeGen/builtin-cpu-supports.c b/clang/test/CodeGen/builtin-cpu-supports.c
index 26edc2c8fff08..92c407653e660 100644
--- a/clang/test/CodeGen/builtin-cpu-supports.c
+++ b/clang/test/CodeGen/builtin-cpu-supports.c
@@ -143,60 +143,82 @@ int v4() { return __builtin_cpu_supports("x86-64-v4"); }
 // CHECK-PPC-NEXT:    br label [[RETURN]]
 // CHECK-PPC:       if.else5:
 // CHECK-PPC-NEXT:    [[CPU_IS6:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
-// CHECK-PPC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[CPU_IS6]], 45
-// CHECK-PPC-NEXT:    br i1 [[TMP9]], label [[IF_THEN7:%.*]], label [[IF_ELSE9:%.*]]
+// CHECK-PPC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[CPU_IS6]], 39
+// CHECK-PPC-NEXT:    br i1 [[TMP9]], label [[IF_THEN7:%.*]], label [[IF_ELSE8:%.*]]
 // CHECK-PPC:       if.then7:
 // CHECK-PPC-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 3
-// CHECK-PPC-NEXT:    store i32 [[ADD8]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 3
+// CHECK-PPC-NEXT:    store i32 [[MUL]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
-// CHECK-PPC:       if.else9:
-// CHECK-PPC-NEXT:    [[CPU_IS10:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
-// CHECK-PPC-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[CPU_IS10]], 46
-// CHECK-PPC-NEXT:    br i1 [[TMP11]], label [[IF_THEN11:%.*]], label [[IF_ELSE13:%.*]]
-// CHECK-PPC:       if.then11:
+// CHECK-PPC:       if.else8:
+// CHECK-PPC-NEXT:    [[CPU_IS9:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[CPU_IS9]], 33
+// CHECK-PPC-NEXT:    br i1 [[TMP11]], label [[IF_THEN10:%.*]], label [[IF_ELSE12:%.*]]
+// CHECK-PPC:       if.then10:
 // CHECK-PPC-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP12]], 3
-// CHECK-PPC-NEXT:    store i32 [[SUB12]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[TMP12]], 4
+// CHECK-PPC-NEXT:    store i32 [[MUL11]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
-// CHECK-PPC:       if.else13:
-// CHECK-PPC-NEXT:    [[CPU_IS14:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
-// CHECK-PPC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[CPU_IS14]], 47
-// CHECK-PPC-NEXT:    br i1 [[TMP13]], label [[IF_THEN15:%.*]], label [[IF_ELSE17:%.*]]
-// CHECK-PPC:       if.then15:
+// CHECK-PPC:       if.else12:
+// CHECK-PPC-NEXT:    [[CPU_IS13:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[CPU_IS13]], 45
+// CHECK-PPC-NEXT:    br i1 [[TMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE16:%.*]]
+// CHECK-PPC:       if.then14:
 // CHECK-PPC-NEXT:    [[TMP14:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP14]], 7
-// CHECK-PPC-NEXT:    store i32 [[ADD16]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP14]], 3
+// CHECK-PPC-NEXT:    store i32 [[ADD15]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
-// CHECK-PPC:       if.else17:
-// CHECK-PPC-NEXT:    [[CPU_IS18:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
-// CHECK-PPC-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[CPU_IS18]], 48
-// CHECK-PPC-NEXT:    br i1 [[TMP15]], label [[IF_THEN19:%.*]], label [[IF_END:%.*]]
-// CHECK-PPC:       if.then19:
+// CHECK-PPC:       if.else16:
+// CHECK-PPC-NEXT:    [[CPU_IS17:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[CPU_IS17]], 46
+// CHECK-PPC-NEXT:    br i1 [[TMP15]], label [[IF_THEN18:%.*]], label [[IF_ELSE20:%.*]]
+// CHECK-PPC:       if.then18:
 // CHECK-PPC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[SUB20:%.*]] = sub nsw i32 [[TMP16]], 7
-// CHECK-PPC-NEXT:    store i32 [[SUB20]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    [[SUB19:%.*]] = sub nsw i32 [[TMP16]], 3
+// CHECK-PPC-NEXT:    store i32 [[SUB19]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else20:
+// CHECK-PPC-NEXT:    [[CPU_IS21:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[CPU_IS21]], 47
+// CHECK-PPC-NEXT:    br i1 [[TMP17]], label [[IF_THEN22:%.*]], label [[IF_ELSE24:%.*]]
+// CHECK-PPC:       if.then22:
+// CHECK-PPC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP18]], 7
+// CHECK-PPC-NEXT:    store i32 [[ADD23]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[RETURN]]
+// CHECK-PPC:       if.else24:
+// CHECK-PPC-NEXT:    [[CPU_IS25:%.*]] = call i32 @llvm.ppc.fixed.addr.ld(i32 3)
+// CHECK-PPC-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[CPU_IS25]], 48
+// CHECK-PPC-NEXT:    br i1 [[TMP19]], label [[IF_THEN26:%.*]], label [[IF_END:%.*]]
+// CHECK-PPC:       if.then26:
+// CHECK-PPC-NEXT:    [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 7
+// CHECK-PPC-NEXT:    store i32 [[SUB27]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
 // CHECK-PPC:       if.end:
-// CHECK-PPC-NEXT:    br label [[IF_END21:%.*]]
-// CHECK-PPC:       if.end21:
-// CHECK-PPC-NEXT:    br label [[IF_END22:%.*]]
-// CHECK-PPC:       if.end22:
-// CHECK-PPC-NEXT:    br label [[IF_END23:%.*]]
-// CHECK-PPC:       if.end23:
-// CHECK-PPC-NEXT:    br label [[IF_END24:%.*]]
-// CHECK-PPC:       if.end24:
-// CHECK-PPC-NEXT:    br label [[IF_END25:%.*]]
-// CHECK-PPC:       if.end25:
-// CHECK-PPC-NEXT:    br label [[IF_END26:%.*]]
-// CHECK-PPC:       if.end26:
-// CHECK-PPC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[A_ADDR]], align 4
-// CHECK-PPC-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP17]], 5
-// CHECK-PPC-NEXT:    store i32 [[ADD27]], ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    br label [[IF_END28:%.*]]
+// CHECK-PPC:       if.end28:
+// CHECK-PPC-NEXT:    br label [[IF_END29:%.*]]
+// CHECK-PPC:       if.end29:
+// CHECK-PPC-NEXT:    br label [[IF_END30:%.*]]
+// CHECK-PPC:       if.end30:
+// CHECK-PPC-NEXT:    br label [[IF_END31:%.*]]
+// CHECK-PPC:       if.end31:
+// CHECK-PPC-NEXT:    br label [[IF_END32:%.*]]
+// CHECK-PPC:       if.end32:
+// CHECK-PPC-NEXT:    br label [[IF_END33:%.*]]
+// CHECK-PPC:       if.end33:
+// CHECK-PPC-NEXT:    br label [[IF_END34:%.*]]
+// CHECK-PPC:       if.end34:
+// CHECK-PPC-NEXT:    br label [[IF_END35:%.*]]
+// CHECK-PPC:       if.end35:
+// CHECK-PPC-NEXT:    [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-PPC-NEXT:    [[ADD36:%.*]] = add nsw i32 [[TMP21]], 5
+// CHECK-PPC-NEXT:    store i32 [[ADD36]], ptr [[RETVAL]], align 4
 // CHECK-PPC-NEXT:    br label [[RETURN]]
 // CHECK-PPC:       return:
-// CHECK-PPC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[RETVAL]], align 4
-// CHECK-PPC-NEXT:    ret i32 [[TMP18]]
+// CHECK-PPC-NEXT:    [[TMP22:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-PPC-NEXT:    ret i32 [[TMP22]]
 //
 int test_ppc(int a) {
   if (__builtin_cpu_supports("arch_3_00")) // HWCAP2
@@ -205,6 +227,10 @@ int test_ppc(int a) {
     return a - 5;
   else if (__builtin_cpu_is("power7"))     // CPUID
     return a + a;
+  else if (__builtin_cpu_is("pwr7"))     // CPUID
+    return a * 3;
+  else if (__builtin_cpu_is("ppc970"))     // CPUID
+    return a * 4;
   else if (__builtin_cpu_is("power8"))
     return a + 3;
   else if (__builtin_cpu_is("power9"))
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 4d6759dd81537..e0757b69242a8 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -57,7 +57,7 @@
 
 // RUN: not %clang_cc1 -triple powerpc--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix PPC
 // PPC: error: unknown target CPU 'not-a-cpu'
-// PPC-NEXT: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, power11, pwr11, powerpc, ppc, ppc32, powerpc64, ppc64, powerpc64le, ppc64le, future{{$}}
+// PPC-NEXT: note: valid target CPU values are: generic, 440, 440fp, ppc440, 450, 601, 602, 603, 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, 8548, ppc405, ppc464, ppc476, 970, ppc970, g5, a2, ppca2, ppc-cell-be, e500, e500mc, e5500, power3, pwr3, pwr4, power4, pwr5, power5, pwr5+, power5+, pwr5x, power5x, pwr6, power6, pwr6x, power6x, pwr7, power7, pwr8, power8, pwr9, power9, pwr10, power10, pwr11, power11, powerpc, ppc, ppc32, powerpc64, ppc64, powerpc64le, ppc64le, future{{$}}
 
 // RUN: not %clang_cc1 -triple mips--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix MIPS
 // MIPS: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def
index df956a68d75d6..da4be3e39f2c7 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.def
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def
@@ -89,23 +89,71 @@
 
 // __builtin_cpu_is() and __builtin_cpu_supports() are supported only on Power7 and up on AIX.
 // PPC_CPU(Name, Linux_SUPPORT_METHOD, LinuxID, AIX_SUPPORT_METHOD, AIXID)
-PPC_CPU("power4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("generic",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("440fp",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
+PPC_CPU("450",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("601",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("602",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603e",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("603ev",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("604",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("604e",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("620",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("630",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("7400",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g4",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("7450",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("g4+",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("750",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("8548",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc405",SYS_CALL,41,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc464",SYS_CALL,43,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc476",SYS_CALL,44,BUILTIN_PPC_FALSE,0)
+PPC_CPU("970",SYS_CALL,33,BUILTIN_PPC_FALSE,0)
 PPC_CPU("ppc970",SYS_CALL,33,BUILTIN_PPC_FALSE,0)
+PPC_CPU("g5",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("a2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppca2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
+PPC_CPU("ppc-cell-be",SYS_CALL,37,BUILTIN_PPC_FALSE,0)
+PPC_CPU("e500",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("e500mc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("e5500",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("power3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr3",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("power4",SYS_CALL,32,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5",SYS_CALL,34,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power5",SYS_CALL,34,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5+",SYS_CALL,35,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power5+",SYS_CALL,35,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr5x",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("power5x",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("pwr6",SYS_CALL,36,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power6",SYS_CALL,36,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc-cell-be",SYS_CALL,37,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr6x",SYS_CALL,38,BUILTIN_PPC_FALSE,0)
 PPC_CPU("power6x",SYS_CALL,38,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr7",SYS_CALL,39,USE_SYS_CONF,AIX_PPC7_VALUE)
 PPC_CPU("power7",SYS_CALL,39,USE_SYS_CONF,AIX_PPC7_VALUE)
-PPC_CPU("ppca2",SYS_CALL,40,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc405",SYS_CALL,41,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc440",SYS_CALL,42,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc464",SYS_CALL,43,BUILTIN_PPC_FALSE,0)
-PPC_CPU("ppc476",SYS_CALL,44,BUILTIN_PPC_FALSE,0)
+PPC_CPU("pwr8",SYS_CALL,45,USE_SYS_CONF,AIX_PPC8_VALUE)
 PPC_CPU("power8",SYS_CALL,45,USE_SYS_CONF,AIX_PPC8_VALUE)
+PPC_CPU("pwr9",SYS_CALL,46,USE_SYS_CONF,AIX_PPC9_VALUE)
 PPC_CPU("power9",SYS_CALL,46,USE_SYS_CONF,AIX_PPC9_VALUE)
+PPC_CPU("pwr10",SYS_CALL,47,USE_SYS_CONF,AIX_PPC10_VALUE)
 PPC_CPU("power10",SYS_CALL,47,USE_SYS_CONF,AIX_PPC10_VALUE)
+PPC_CPU("pwr11",SYS_CALL,48,USE_SYS_CONF,AIX_PPC11_VALUE)
 PPC_CPU("power11",SYS_CALL,48,USE_SYS_CONF,AIX_PPC11_VALUE)
+PPC_CPU("powerpc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc32",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("powerpc64",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc64",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("powerpc64le",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("ppc64le",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
+PPC_CPU("future",BUILTIN_PPC_UNSUPPORTED,0,BUILTIN_PPC_UNSUPPORTED,0)
 #undef PPC_CPU
 
 // PPC features on Linux:
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
new file mode 100644
index 0000000000000..5f9fe543aff0b
--- /dev/null
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -0,0 +1,40 @@
+//===---- PPCTargetParser - Parser for target features ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise hardware features
+// for PPC CPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
+#define LLVM_TARGETPARSER_PPCTARGETPARSER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/TargetParser/Triple.h"
+
+namespace llvm {
+namespace PPC {
+bool isValidCPU(StringRef CPU);
+void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
+
+// Get target CPU name.
+// If CPUName is empty or generic, return the default CPU name.
+// Otherwise, return the normalized CPU name.
+StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = "");
+
+// Get the tune CPU name.
+StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = "");
+
+// For PPC, several CPU names refer to the same CPU, like pwr10 and power10;
+// normalize them.
+StringRef normalizeCPUName(StringRef CPUName);
+} // namespace PPC
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 4b5d582d57a42..8ec32f7410566 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMTargetParser
   CSKYTargetParser.cpp
   Host.cpp
   LoongArchTargetParser.cpp
+  PPCTargetParser.cpp
   RISCVISAInfo.cpp
   RISCVTargetParser.cpp
   SubtargetFeature.cpp
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
new file mode 100644
index 0000000000000..06a18f54affd8
--- /dev/null
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -0,0 +1,121 @@
+//===---- PPCTargetParser.cpp - Parser for target features ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise hardware features
+// for PPC CPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TargetParser/PPCTargetParser.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/TargetParser/Host.h"
+
+namespace llvm {
+namespace PPC {
+
+struct CPUInfo {
+  StringLiteral Name;
+  // FIXME: add the features field for this CPU.
+};
+
+constexpr CPUInfo PPCCPUInfo[] = {
+#define PPC_CPU(Name, Linux_SUPPORT_METHOD, LinuxID, AIX_SUPPORT_METHOD,       \
+                AIXID)                                                         \
+  Name,
+#include "llvm/TargetParser/PPCTargetParser.def"
+};
+
+static const CPUInfo *getCPUInfoByName(StringRef CPU) {
+  for (auto &C : PPCCPUInfo)
+    if (C.Name == CPU)
+      return &C;
+  return nullptr;
+}
+
+StringRef normalizeCPUName(StringRef CPUName) {
+  // Clang/LLVM does not actually support code generation
+  // for the 405 CPU. However, there are uses of this CPU ID
+  // in projects that previously used GCC and rely on Clang
+  // accepting it. Clang has always ignored it and passed the
+  // generic CPU ID to the back end.
+  return StringSwitch<StringRef>(CPUName)
+      .Cases("common", "405", "generic")
+      .Cases("ppc440", "440fp", "440")
+      .Cases("630", "power3", "pwr3")
+      .Case("G3", "g3")
+      .Case("G4", "g4")
+      .Case("G4+", "g4+")
+      .Case("8548", "e500")
+      .Case("ppc970", "970")
+      .Case("G5", "g5")
+      .Case("ppca2", "a2")
+      .Case("power4", "pwr4")
+      .Case("power5", "pwr5")
+      .Case("power5x", "pwr5x")
+      .Case("power5+", "pwr5+")
+      .Case("power6", "pwr6")
+      .Case("power6x", "pwr6x")
+      .Case("power7", "pwr7")
+      .Case("power8", "pwr8")
+      .Case("power9", "pwr9")
+      .Case("power10", "pwr10")
+      .Case("power11", "pwr11")
+      .Cases("powerpc", "powerpc32", "ppc")
+      .Case("powerpc64", "ppc64")
+      .Case("powerpc64le", "ppc64le")
+      .Default(CPUName);
+}
+
+void fillValidCPUList(SmallVectorImpl<StringRef> &Values) {
+  for (const auto &C : PPCCPUInfo)
+    Values.emplace_back(C.Name);
+}
+
+void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) {
+  for (const auto &C : PPCCPUInfo)
+    Values.emplace_back(C.Name);
+}
+
+bool isValidCPU(StringRef CPU) {
+  const CPUInfo *Info = getCPUInfoByName(CPU);
+  if (!Info)
+    return false;
+  return true;
+}
+
+StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName) {
+  if (!CPUName.empty()) {
+    if (CPUName == "native") {
+      std::string CPU = std::string(sys::getHostCPUName());
+      if (!CPU.empty() && CPU != "generic")
+        return CPU;
+    }
+
+    StringRef CPU = normalizeCPUName(CPUName);
+    if (CPU != "generic")
+      return CPU;
+  }
+
+  // LLVM may default to generating code for the native CPU, but, like gcc, we
+  // default to a more generic option for each architecture. (except on AIX)
+  if (T.isOSAIX())
+    return "pwr7";
+  else if (T.getArch() == Triple::ppc64le)
+    return "ppc64le";
+  else if (T.getArch() == Triple::ppc64)
+    return "ppc64";
+
+  return "ppc";
+}
+
+StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
+  return getNormalizedPPCTargetCPU(T, CPUName);
+}
+
+} // namespace PPC
+} // namespace llvm
diff --git a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
index 31919badac7be..3dbc803d0d483 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
@@ -13,6 +13,7 @@ static_library("TargetParser") {
     "CSKYTargetParser.cpp",
     "Host.cpp",
     "LoongArchTargetParser.cpp",
+    "PPCTargetParser.cpp",
     "RISCVISAInfo.cpp",
     "RISCVTargetParser.cpp",
     "SubtargetFeature.cpp",

>From fe8d1e61ac4104f48bbefc6959a084a8e16927e9 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 25 Jul 2024 07:52:38 +0200
Subject: [PATCH 092/259] [AMDGPU][NFC] Use GFX940 prefix in dpp64_combine test
 (#100459)

---
 llvm/test/CodeGen/AMDGPU/dpp64_combine.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index 3d8cda589f46f..60ce730c3eed3 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX940
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
 
@@ -74,7 +74,7 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
 ; DPP64: v_mov_b32_dpp
 ; GFX90A: v_add_co_u32_e32
 ; GFX90A: v_addc_co_u32_e32
-; DPPMOV64: v_lshl_add_u64
+; GFX940: v_lshl_add_u64
 ; GFX10: v_mov_b32_dpp
 ; GFX10: v_add_co_u32
 ; GFX10: v_add_co_ci_u32_e32

>From 1b7631a699e6af7f497548a1ceb5be0570c60ed0 Mon Sep 17 00:00:00 2001
From: PeterChou1 <peter.chou at mail.utoronto.ca>
Date: Thu, 25 Jul 2024 02:03:20 -0400
Subject: [PATCH 093/259] [clang-doc] Improve clang-doc performance through
 memoization (#96809)
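
The speedup comes from memoizing declarations by USR so each declaration is
mapped only once across the thread pool. The snippet below is a standalone
sketch of that pattern using the same StringSet-plus-mutex scheme as the
patch; the real mapDecl additionally exempts typedef'd anonymous records,
which the sketch omits, and the helper name is made up:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSet.h"
  #include "llvm/Support/Mutex.h"
  #include <mutex>

  static llvm::StringSet<> VisitedUSRs;
  static llvm::sys::Mutex VisitedUSRsGuard;

  // Returns true when this USR was already mapped. A USR only counts as
  // visited once its definition has been seen, so an earlier forward
  // declaration does not shadow the later definition.
  static bool alreadyVisited(llvm::StringRef USR, bool IsDefinition) {
    std::lock_guard<llvm::sys::Mutex> Guard(VisitedUSRsGuard);
    if (VisitedUSRs.count(USR))
      return true;
    if (IsDefinition)
      VisitedUSRs.insert(USR);
    return false;
  }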

---
 clang-tools-extra/clang-doc/Mapper.cpp        | 44 +++++++++++++++----
 clang-tools-extra/clang-doc/Mapper.h          |  2 +-
 .../clang-doc/tool/ClangDocMain.cpp           |  1 -
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/clang-tools-extra/clang-doc/Mapper.cpp b/clang-tools-extra/clang-doc/Mapper.cpp
index bb8b7952980ac..6c90db03424c6 100644
--- a/clang-tools-extra/clang-doc/Mapper.cpp
+++ b/clang-tools-extra/clang-doc/Mapper.cpp
@@ -12,16 +12,28 @@
 #include "clang/AST/Comment.h"
 #include "clang/Index/USRGeneration.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Error.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Mutex.h"
 
 namespace clang {
 namespace doc {
 
+static llvm::StringSet<> USRVisited;
+static llvm::sys::Mutex USRVisitedGuard;
+
+template <typename T> bool isTypedefAnonRecord(const T *D) {
+  if (const auto *C = dyn_cast<CXXRecordDecl>(D)) {
+    return C->getTypedefNameForAnonDecl();
+  }
+  return false;
+}
+
 void MapASTVisitor::HandleTranslationUnit(ASTContext &Context) {
   TraverseDecl(Context.getTranslationUnitDecl());
 }
 
-template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
+template <typename T>
+bool MapASTVisitor::mapDecl(const T *D, bool IsDefinition) {
   // If we're looking a decl not in user files, skip this decl.
   if (D->getASTContext().getSourceManager().isInSystemHeader(D->getLocation()))
     return true;
@@ -34,6 +46,16 @@ template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
   // If there is an error generating a USR for the decl, skip this decl.
   if (index::generateUSRForDecl(D, USR))
     return true;
+  // Prevent visiting the same USR twice
+  {
+    std::lock_guard<llvm::sys::Mutex> Guard(USRVisitedGuard);
+    StringRef Visited = USR.str();
+    if (USRVisited.count(Visited) && !isTypedefAnonRecord<T>(D))
+      return true;
+    // We consider a USR to be visited only when it is defined
+    if (IsDefinition)
+      USRVisited.insert(Visited);
+  }
   bool IsFileInRootDir;
   llvm::SmallString<128> File =
       getFile(D, D->getASTContext(), CDCtx.SourceRoot, IsFileInRootDir);
@@ -53,30 +75,34 @@ template <typename T> bool MapASTVisitor::mapDecl(const T *D) {
 }
 
 bool MapASTVisitor::VisitNamespaceDecl(const NamespaceDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
-bool MapASTVisitor::VisitRecordDecl(const RecordDecl *D) { return mapDecl(D); }
+bool MapASTVisitor::VisitRecordDecl(const RecordDecl *D) {
+  return mapDecl(D, D->isThisDeclarationADefinition());
+}
 
-bool MapASTVisitor::VisitEnumDecl(const EnumDecl *D) { return mapDecl(D); }
+bool MapASTVisitor::VisitEnumDecl(const EnumDecl *D) {
+  return mapDecl(D, D->isThisDeclarationADefinition());
+}
 
 bool MapASTVisitor::VisitCXXMethodDecl(const CXXMethodDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, D->isThisDeclarationADefinition());
 }
 
 bool MapASTVisitor::VisitFunctionDecl(const FunctionDecl *D) {
   // Don't visit CXXMethodDecls twice
   if (isa<CXXMethodDecl>(D))
     return true;
-  return mapDecl(D);
+  return mapDecl(D, D->isThisDeclarationADefinition());
 }
 
 bool MapASTVisitor::VisitTypedefDecl(const TypedefDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
 bool MapASTVisitor::VisitTypeAliasDecl(const TypeAliasDecl *D) {
-  return mapDecl(D);
+  return mapDecl(D, /*isDefinition=*/true);
 }
 
 comments::FullComment *
diff --git a/clang-tools-extra/clang-doc/Mapper.h b/clang-tools-extra/clang-doc/Mapper.h
index cedde935ab743..75c8e947c8f90 100644
--- a/clang-tools-extra/clang-doc/Mapper.h
+++ b/clang-tools-extra/clang-doc/Mapper.h
@@ -43,7 +43,7 @@ class MapASTVisitor : public clang::RecursiveASTVisitor<MapASTVisitor>,
   bool VisitTypeAliasDecl(const TypeAliasDecl *D);
 
 private:
-  template <typename T> bool mapDecl(const T *D);
+  template <typename T> bool mapDecl(const T *D, bool IsDefinition);
 
   int getLine(const NamedDecl *D, const ASTContext &Context) const;
   llvm::SmallString<128> getFile(const NamedDecl *D, const ASTContext &Context,
diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
index 6198a6e0cdcc3..3363cafeded5e 100644
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -303,7 +303,6 @@ Example usage for a project using a compile commands database:
   for (auto &Group : USRToBitcode) {
     Pool.async([&]() {
       std::vector<std::unique_ptr<doc::Info>> Infos;
-
       for (auto &Bitcode : Group.getValue()) {
         llvm::BitstreamCursor Stream(Bitcode);
         doc::ClangDocBitcodeReader Reader(Stream);

>From 26b70707fc2cc0ab8883e6492a4808401a6a4bad Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Thu, 25 Jul 2024 12:22:40 +0530
Subject: [PATCH 094/259] [Clang] Remove some dead code in
 getNumTeamsExprForTargetDirective (#95695)

This was reported in https://pvs-studio.com/en/blog/posts/cpp/1126/,
fragment N9.

V523 The 'then' statement is equivalent to the subsequent code fragment.
CGOpenMPRuntime.cpp:6040, 6036
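
In other words, the removed branch and the code that follows it do the same
thing, as in this generic sketch (the names are stand-ins, not the real
CGOpenMPRuntime code):

  const int *pickNumTeams(bool NestedIsParallelOrSimd, int &MinTeamsVal,
                          int &MaxTeamsVal) {
    if (NestedIsParallelOrSimd) { // V523: body repeats the fall-through code
      MinTeamsVal = MaxTeamsVal = 1;
      return nullptr;
    }
    MinTeamsVal = MaxTeamsVal = 1;
    return nullptr;
  }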

---------

Co-authored-by: Shivam Gupta <shivma98.tkg at gmail.com>
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a6a87ec88ee8a..2cef23c733d25 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6047,11 +6047,6 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
         MinTeamsVal = MaxTeamsVal = 0;
         return nullptr;
       }
-      if (isOpenMPParallelDirective(NestedDir->getDirectiveKind()) ||
-          isOpenMPSimdDirective(NestedDir->getDirectiveKind())) {
-        MinTeamsVal = MaxTeamsVal = 1;
-        return nullptr;
-      }
       MinTeamsVal = MaxTeamsVal = 1;
       return nullptr;
     }

>From 2e0c2154a8dc5d8ab2a8cf360aa0184fb58c0860 Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye at intel.com>
Date: Thu, 25 Jul 2024 14:55:48 +0800
Subject: [PATCH 095/259] [X86][MC] Support 'ctestX <reg>, <mem>' and 'ctestX
 <mem>, <reg>' as synonyms. (#97985)

Related gas discussion:
https://sourceware.org/pipermail/binutils/2024-July/135349.html
---
 llvm/lib/Target/X86/X86InstrAsmAlias.td |  8 ++++++++
 llvm/test/MC/X86/apx/ctest-att.s        | 15 ++++++++++++++-
 llvm/test/MC/X86/apx/ctest-intel.s      | 12 ++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index 493adbb5b9d41..423ee0e8c9bda 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -157,6 +157,14 @@ def : InstAlias<"ctest"#Cond#"{l} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST32mr i32mem:$src1, GR32:$src2, cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{q} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST64mr i64mem:$src1, GR64:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{b} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST8mr  i8mem:$src1,  GR8:$src2,  cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{w} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST16mr i16mem:$src1, GR16:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{l} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST32mr i32mem:$src1, GR32:$src2, cflags:$dcf, CC), 0>;
+def : InstAlias<"ctest"#Cond#"{q} $dcf\t{$src1, $src2|$src2, $src1}",
+                (CTEST64mr i64mem:$src1, GR64:$src2, cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{b} $dcf\t{$src2, $src1|$src1, $src2}",
                 (CTEST8ri  GR8:$src1,  i8imm:$src2,  cflags:$dcf, CC), 0>;
 def : InstAlias<"ctest"#Cond#"{w} $dcf\t{$src2, $src1|$src1, $src2}",
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
index b9e98adc9841b..809ffc4512da6 100644
--- a/llvm/test/MC/X86/apx/ctest-att.s
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-256: error:
+# ERROR-COUNT-260: error:
 # ERROR-NOT: error:
 # CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
 # CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
@@ -27,6 +27,19 @@
 # CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
 # CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
          ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# Swap mr form
+# CHECK: ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestbb {dfv=of} 123(%r8,%rax,4), %bl
+# CHECK: ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestbw {dfv=of} 123(%r8,%rax,4), %dx
+# CHECK: ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbl {dfv=of} 123(%r8,%rax,4), %ecx
+# CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbq {dfv=of} 123(%r8,%rax,4), %r9
 # CHECK: ctestbb {dfv=of} $123, %bl
 # CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
          ctestbb {dfv=of} $123, %bl
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
index 17cea489b4765..b984163069b7a 100644
--- a/llvm/test/MC/X86/apx/ctest-intel.s
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -24,6 +24,18 @@
 # CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
 # CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
          ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestb {dfv=of} bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestb {dfv=of} dx, word ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} r9, qword ptr [r8 + 4*rax + 123]
 # CHECK: ctestb {dfv=of} bl, 123
 # CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
          ctestb {dfv=of} bl, 123

>From 74fcb6aafddd56df1bd6d6841b2e0f289f8e54b0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 24 Jul 2024 23:57:53 -0700
Subject: [PATCH 096/259] [TargetParser] Fix warnings

This patch fixes:

  llvm/include/llvm/TargetParser/PPCTargetParser.def:109:9: error:
  suggest braces around initialization of subobject
  [-Werror,-Wmissing-braces]

  llvm/lib/TargetParser/PPCTargetParser.cpp:96:16: error: address of
  stack memory associated with local variable 'CPU' returned
  [-Werror,-Wreturn-stack-address]
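
For the second warning, a minimal illustration of what -Wreturn-stack-address
is flagging (the function names are made up; sys::getHostCPUName() is the
same call the patched code uses):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/TargetParser/Host.h"
  #include <string>

  // What the warning flags: the returned StringRef points into a local
  // std::string whose buffer is destroyed when the function returns.
  llvm::StringRef nativeCPUDangling() {
    std::string CPU = std::string(llvm::sys::getHostCPUName());
    return CPU; // -Wreturn-stack-address
  }

  // The fix keeps the StringRef that getHostCPUName() already returns, so
  // no temporary buffer is involved.
  llvm::StringRef nativeCPUFixed() { return llvm::sys::getHostCPUName(); }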
---
 llvm/lib/TargetParser/PPCTargetParser.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 06a18f54affd8..c773c326f1841 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -26,7 +26,7 @@ struct CPUInfo {
 constexpr CPUInfo PPCCPUInfo[] = {
 #define PPC_CPU(Name, Linux_SUPPORT_METHOD, LinuxID, AIX_SUPPORT_METHOD,       \
                 AIXID)                                                         \
-  Name,
+  {Name},
 #include "llvm/TargetParser/PPCTargetParser.def"
 };
 
@@ -91,7 +91,7 @@ bool isValidCPU(StringRef CPU) {
 StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName) {
   if (!CPUName.empty()) {
     if (CPUName == "native") {
-      std::string CPU = std::string(sys::getHostCPUName());
+      StringRef CPU = sys::getHostCPUName();
       if (!CPU.empty() && CPU != "generic")
         return CPU;
     }

>From 2914a4b88837177d4a91a99525c1a3117242236d Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Thu, 25 Jul 2024 00:11:43 -0700
Subject: [PATCH 097/259] [lldb/Commands] Add `scripting template list` command
 with auto discovery

This patch introduces a new `template` multiword sub-command to the
`scripting` top-level command. As the name suggests, this sub-command
operates on scripting templates, and currently has the ability to
automatically discover the various scripting extensions that lldb
supports.

This was previously reviewed in #97273.

Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
 lldb/include/lldb/Core/PluginManager.h        |  20 +++
 .../Interfaces/ScriptedInterface.h            |   7 +
 .../Interfaces/ScriptedInterfaceUsages.h      |  43 ++++++
 lldb/include/lldb/lldb-private-interfaces.h   |   3 +
 .../Commands/CommandObjectScripting.cpp       | 126 +++++++++++++++++-
 lldb/source/Commands/Options.td               |   6 +
 lldb/source/Core/PluginManager.cpp            |  65 +++++++++
 lldb/source/Interpreter/CMakeLists.txt        |   4 +
 .../Interpreter/Interfaces/CMakeLists.txt     |  10 ++
 .../Interfaces/ScriptedInterfaceUsages.cpp    |  37 +++++
 .../Python/Interfaces/CMakeLists.txt          |  10 +-
 .../CMakeLists.txt                            |  16 +++
 .../OperatingSystemPythonInterface.cpp        |  25 +++-
 .../OperatingSystemPythonInterface.h          |  18 ++-
 .../CMakeLists.txt                            |  16 +++
 .../ScriptedPlatformPythonInterface.cpp       |  24 +++-
 .../ScriptedPlatformPythonInterface.h         |  17 ++-
 .../CMakeLists.txt                            |  16 +++
 .../ScriptedProcessPythonInterface.cpp        |  39 +++++-
 .../ScriptedProcessPythonInterface.h          |  18 ++-
 .../CMakeLists.txt                            |  16 +++
 .../ScriptedThreadPlanPythonInterface.cpp     |  26 +++-
 .../ScriptedThreadPlanPythonInterface.h       |  18 ++-
 .../Python/ScriptInterpreterPython.cpp        |   8 +-
 24 files changed, 548 insertions(+), 40 deletions(-)
 create mode 100644 lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
 create mode 100644 lldb/source/Interpreter/Interfaces/CMakeLists.txt
 create mode 100644 lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
 create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => OperatingSystemPythonInterface}/OperatingSystemPythonInterface.cpp (75%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => OperatingSystemPythonInterface}/OperatingSystemPythonInterface.h (83%)
 create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedPlatformPythonInterface}/ScriptedPlatformPythonInterface.cpp (83%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedPlatformPythonInterface}/ScriptedPlatformPythonInterface.h (84%)
 create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedProcessPythonInterface}/ScriptedProcessPythonInterface.cpp (85%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedProcessPythonInterface}/ScriptedProcessPythonInterface.h (88%)
 create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedThreadPlanPythonInterface}/ScriptedThreadPlanPythonInterface.cpp (77%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ => ScriptedThreadPlanPythonInterface}/ScriptedThreadPlanPythonInterface.h (82%)

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index 38a291d9f0afd..a23f834f471fb 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -10,6 +10,7 @@
 #define LLDB_CORE_PLUGINMANAGER_H
 
 #include "lldb/Core/Architecture.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Symbol/TypeSystem.h"
 #include "lldb/Utility/CompletionRequest.h"
 #include "lldb/Utility/FileSpec.h"
@@ -487,6 +488,25 @@ class PluginManager {
 
   static LanguageSet GetAllTypeSystemSupportedLanguagesForExpressions();
 
+  // Scripted Interface
+  static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
+                             ScriptedInterfaceCreateInstance create_callback,
+                             lldb::ScriptLanguage language,
+                             ScriptedInterfaceUsages usages);
+
+  static bool UnregisterPlugin(ScriptedInterfaceCreateInstance create_callback);
+
+  static uint32_t GetNumScriptedInterfaces();
+
+  static llvm::StringRef GetScriptedInterfaceNameAtIndex(uint32_t idx);
+
+  static llvm::StringRef GetScriptedInterfaceDescriptionAtIndex(uint32_t idx);
+
+  static lldb::ScriptLanguage GetScriptedInterfaceLanguageAtIndex(uint32_t idx);
+
+  static ScriptedInterfaceUsages
+  GetScriptedInterfaceUsagesAtIndex(uint32_t idx);
+
   // REPL
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              REPLCreateInstance create_callback,
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
index 69504dbcda5dc..3ce47d0584a8a 100644
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
@@ -9,6 +9,8 @@
 #ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 #define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 
+#include "ScriptedInterfaceUsages.h"
+
 #include "lldb/Core/StructuredDataImpl.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
@@ -68,6 +70,11 @@ class ScriptedInterface {
     return true;
   }
 
+  static bool CreateInstance(lldb::ScriptLanguage language,
+                             ScriptedInterfaceUsages usages) {
+    return false;
+  }
+
 protected:
   StructuredData::GenericSP m_object_instance_sp;
 };
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
new file mode 100644
index 0000000000000..36c0cfdca546e
--- /dev/null
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
@@ -0,0 +1,43 @@
+//===-- ScriptedInterfaceUsages.h ---------------------------- -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+#define LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+
+#include "lldb/lldb-types.h"
+
+#include "lldb/Utility/Stream.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace lldb_private {
+class ScriptedInterfaceUsages {
+public:
+  ScriptedInterfaceUsages() = default;
+  ScriptedInterfaceUsages(const std::vector<llvm::StringRef> ci_usages,
+                          const std::vector<llvm::StringRef> sbapi_usages)
+      : m_command_interpreter_usages(ci_usages), m_sbapi_usages(sbapi_usages) {}
+
+  const std::vector<llvm::StringRef> &GetCommandInterpreterUsages() const {
+    return m_command_interpreter_usages;
+  }
+
+  const std::vector<llvm::StringRef> &GetSBAPIUsages() const {
+    return m_sbapi_usages;
+  }
+
+  enum class UsageKind { CommandInterpreter, API };
+
+  void Dump(Stream &s, UsageKind kind) const;
+
+private:
+  std::vector<llvm::StringRef> m_command_interpreter_usages;
+  std::vector<llvm::StringRef> m_sbapi_usages;
+};
+} // namespace lldb_private
+
+#endif // LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index 10eaf1e6a5add..87c5ff8d22fb6 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -25,6 +25,7 @@ class Value;
 } // namespace llvm
 
 namespace lldb_private {
+class ScriptedInterfaceUsages;
 typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp,
                                          const ArchSpec &arch);
 typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)(
@@ -124,6 +125,8 @@ typedef lldb::REPLSP (*REPLCreateInstance)(Status &error,
                                            lldb::LanguageType language,
                                            Debugger *debugger, Target *target,
                                            const char *repl_options);
+typedef bool (*ScriptedInterfaceCreateInstance)(lldb::ScriptLanguage language,
+                                                ScriptedInterfaceUsages usages);
 typedef int (*ComparisonFunction)(const void *, const void *);
 typedef void (*DebuggerInitializeCallback)(Debugger &debugger);
 /// Trace
diff --git a/lldb/source/Commands/CommandObjectScripting.cpp b/lldb/source/Commands/CommandObjectScripting.cpp
index fee0565a7c48a..730a190a6e891 100644
--- a/lldb/source/Commands/CommandObjectScripting.cpp
+++ b/lldb/source/Commands/CommandObjectScripting.cpp
@@ -8,12 +8,14 @@
 
 #include "CommandObjectScripting.h"
 #include "lldb/Core/Debugger.h"
+#include "lldb/Core/PluginManager.h"
 #include "lldb/DataFormatters/DataVisualization.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Host/OptionParser.h"
 #include "lldb/Interpreter/CommandInterpreter.h"
 #include "lldb/Interpreter/CommandOptionArgumentTable.h"
 #include "lldb/Interpreter/CommandReturnObject.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Interpreter/OptionArgParser.h"
 #include "lldb/Interpreter/ScriptInterpreter.h"
 #include "lldb/Utility/Args.h"
@@ -127,9 +129,126 @@ class CommandObjectScriptingRun : public CommandObjectRaw {
   CommandOptions m_options;
 };
 
-#pragma mark CommandObjectMultiwordScripting
+#define LLDB_OPTIONS_scripting_template_list
+#include "CommandOptions.inc"
+
+class CommandObjectScriptingTemplateList : public CommandObjectParsed {
+public:
+  CommandObjectScriptingTemplateList(CommandInterpreter &interpreter)
+      : CommandObjectParsed(
+            interpreter, "scripting template list",
+            "List all the available scripting extension templates. ",
+            "scripting template list [--language <scripting-language> --]") {}
+
+  ~CommandObjectScriptingTemplateList() override = default;
+
+  Options *GetOptions() override { return &m_options; }
+
+  class CommandOptions : public Options {
+  public:
+    CommandOptions() = default;
+    ~CommandOptions() override = default;
+    Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
+                          ExecutionContext *execution_context) override {
+      Status error;
+      const int short_option = m_getopt_table[option_idx].val;
 
-// CommandObjectMultiwordScripting
+      switch (short_option) {
+      case 'l':
+        m_language = (lldb::ScriptLanguage)OptionArgParser::ToOptionEnum(
+            option_arg, GetDefinitions()[option_idx].enum_values,
+            eScriptLanguageNone, error);
+        if (!error.Success())
+          error.SetErrorStringWithFormatv(
+              "unrecognized value for language '{0}'", option_arg);
+        break;
+      default:
+        llvm_unreachable("Unimplemented option");
+      }
+
+      return error;
+    }
+
+    void OptionParsingStarting(ExecutionContext *execution_context) override {
+      m_language = lldb::eScriptLanguageDefault;
+    }
+
+    llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
+      return llvm::ArrayRef(g_scripting_template_list_options);
+    }
+
+    lldb::ScriptLanguage m_language = lldb::eScriptLanguageDefault;
+  };
+
+protected:
+  void DoExecute(Args &command, CommandReturnObject &result) override {
+    Stream &s = result.GetOutputStream();
+    s.Printf("Available scripted extension templates:");
+
+    auto print_field = [&s](llvm::StringRef key, llvm::StringRef value) {
+      if (!value.empty()) {
+        s.IndentMore();
+        s.Indent();
+        s << key << ": " << value << '\n';
+        s.IndentLess();
+      }
+    };
+
+    size_t num_listed_interface = 0;
+    size_t num_templates = PluginManager::GetNumScriptedInterfaces();
+    for (size_t i = 0; i < num_templates; i++) {
+      llvm::StringRef plugin_name =
+          PluginManager::GetScriptedInterfaceNameAtIndex(i);
+      if (plugin_name.empty())
+        break;
+
+      lldb::ScriptLanguage lang =
+          PluginManager::GetScriptedInterfaceLanguageAtIndex(i);
+      if (lang != m_options.m_language)
+        continue;
+
+      if (!num_listed_interface)
+        s.EOL();
+
+      num_listed_interface++;
+
+      llvm::StringRef desc =
+          PluginManager::GetScriptedInterfaceDescriptionAtIndex(i);
+      ScriptedInterfaceUsages usages =
+          PluginManager::GetScriptedInterfaceUsagesAtIndex(i);
+
+      print_field("Name", plugin_name);
+      print_field("Language", ScriptInterpreter::LanguageToString(lang));
+      print_field("Description", desc);
+      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::API);
+      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::CommandInterpreter);
+
+      if (i != num_templates - 1)
+        s.EOL();
+    }
+
+    if (!num_listed_interface)
+      s << " None\n";
+  }
+
+private:
+  CommandOptions m_options;
+};
+
+class CommandObjectMultiwordScriptingTemplate : public CommandObjectMultiword {
+public:
+  CommandObjectMultiwordScriptingTemplate(CommandInterpreter &interpreter)
+      : CommandObjectMultiword(
+            interpreter, "scripting template",
+            "Commands for operating on the scripting templates.",
+            "scripting template [<subcommand-options>]") {
+    LoadSubCommand(
+        "list",
+        CommandObjectSP(new CommandObjectScriptingTemplateList(interpreter)));
+  }
+
+  ~CommandObjectMultiwordScriptingTemplate() override = default;
+};
 
 CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
     CommandInterpreter &interpreter)
@@ -139,6 +258,9 @@ CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
           "scripting <subcommand> [<subcommand-options>]") {
   LoadSubCommand("run",
                  CommandObjectSP(new CommandObjectScriptingRun(interpreter)));
+  LoadSubCommand("template",
+                 CommandObjectSP(
+                     new CommandObjectMultiwordScriptingTemplate(interpreter)));
 }
 
 CommandObjectMultiwordScripting::~CommandObjectMultiwordScripting() = default;
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 24e97f3bb97d3..6e5ed21b22ad8 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -841,6 +841,12 @@ let Command = "scripting run" in {
     " language. If none is specific the default scripting language is used.">;
 }
 
+let Command = "scripting template list" in {
+  def scripting_template_list_language : Option<"language", "l">,
+    EnumArg<"ScriptLang">, Desc<"Specify the scripting "
+    " language. If none is specified the default scripting language is used.">;
+}
+
 let Command = "source info" in {
   def source_info_count : Option<"count", "c">, Arg<"Count">,
     Desc<"The number of line entries to display.">;
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 759ef3a8afe02..01bee8680b7ba 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -1505,6 +1505,70 @@ LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForExpressions() {
   return all;
 }
 
+#pragma mark ScriptedInterfaces
+
+struct ScriptedInterfaceInstance
+    : public PluginInstance<ScriptedInterfaceCreateInstance> {
+  ScriptedInterfaceInstance(llvm::StringRef name, llvm::StringRef description,
+                            ScriptedInterfaceCreateInstance create_callback,
+                            lldb::ScriptLanguage language,
+                            ScriptedInterfaceUsages usages)
+      : PluginInstance<ScriptedInterfaceCreateInstance>(name, description,
+                                                        create_callback),
+        language(language), usages(usages) {}
+
+  lldb::ScriptLanguage language;
+  ScriptedInterfaceUsages usages;
+};
+
+typedef PluginInstances<ScriptedInterfaceInstance> ScriptedInterfaceInstances;
+
+static ScriptedInterfaceInstances &GetScriptedInterfaceInstances() {
+  static ScriptedInterfaceInstances g_instances;
+  return g_instances;
+}
+
+bool PluginManager::RegisterPlugin(
+    llvm::StringRef name, llvm::StringRef description,
+    ScriptedInterfaceCreateInstance create_callback,
+    lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) {
+  return GetScriptedInterfaceInstances().RegisterPlugin(
+      name, description, create_callback, language, usages);
+}
+
+bool PluginManager::UnregisterPlugin(
+    ScriptedInterfaceCreateInstance create_callback) {
+  return GetScriptedInterfaceInstances().UnregisterPlugin(create_callback);
+}
+
+uint32_t PluginManager::GetNumScriptedInterfaces() {
+  return GetScriptedInterfaceInstances().GetInstances().size();
+}
+
+llvm::StringRef PluginManager::GetScriptedInterfaceNameAtIndex(uint32_t index) {
+  return GetScriptedInterfaceInstances().GetNameAtIndex(index);
+}
+
+llvm::StringRef
+PluginManager::GetScriptedInterfaceDescriptionAtIndex(uint32_t index) {
+  return GetScriptedInterfaceInstances().GetDescriptionAtIndex(index);
+}
+
+lldb::ScriptLanguage
+PluginManager::GetScriptedInterfaceLanguageAtIndex(uint32_t idx) {
+  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+  return idx < instances.size() ? instances[idx].language
+                                : ScriptLanguage::eScriptLanguageNone;
+}
+
+ScriptedInterfaceUsages
+PluginManager::GetScriptedInterfaceUsagesAtIndex(uint32_t idx) {
+  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+  if (idx >= instances.size())
+    return {};
+  return instances[idx].usages;
+}
+
 #pragma mark REPL
 
 struct REPLInstance : public PluginInstance<REPLCreateInstance> {
@@ -1565,6 +1629,7 @@ void PluginManager::DebuggerInitialize(Debugger &debugger) {
   GetOperatingSystemInstances().PerformDebuggerCallback(debugger);
   GetStructuredDataPluginInstances().PerformDebuggerCallback(debugger);
   GetTracePluginInstances().PerformDebuggerCallback(debugger);
+  GetScriptedInterfaceInstances().PerformDebuggerCallback(debugger);
 }
 
 // This is the preferred new way to register plugin specific settings.  e.g.
diff --git a/lldb/source/Interpreter/CMakeLists.txt b/lldb/source/Interpreter/CMakeLists.txt
index ae79b82d7c3e2..642263a8bda7f 100644
--- a/lldb/source/Interpreter/CMakeLists.txt
+++ b/lldb/source/Interpreter/CMakeLists.txt
@@ -6,6 +6,8 @@ lldb_tablegen(InterpreterPropertiesEnum.inc -gen-lldb-property-enum-defs
   SOURCE InterpreterProperties.td
   TARGET LLDBInterpreterPropertiesEnumGen)
 
+add_subdirectory(Interfaces)
+
 add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   CommandAlias.cpp
   CommandHistory.cpp
@@ -54,6 +56,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   ScriptInterpreter.cpp
 
   LINK_LIBS
+    lldbInterpreterInterfaces
     lldbCommands
     lldbCore
     lldbDataFormatters
@@ -66,6 +69,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   )
 
 add_dependencies(lldbInterpreter
+  lldbInterpreterInterfaces
   LLDBInterpreterPropertiesGen
   LLDBInterpreterPropertiesEnumGen)
 
diff --git a/lldb/source/Interpreter/Interfaces/CMakeLists.txt b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
new file mode 100644
index 0000000000000..f44672aa50b75
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_lldb_library(lldbInterpreterInterfaces NO_PLUGIN_DEPENDENCIES
+  ScriptedInterfaceUsages.cpp
+
+  LINK_LIBS
+    lldbUtility
+
+  LINK_COMPONENTS
+    Support
+  )
+
diff --git a/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
new file mode 100644
index 0000000000000..05d7a5d852f8c
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
@@ -0,0 +1,37 @@
+//===-- ScriptedInterfaceUsages.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+void ScriptedInterfaceUsages::Dump(Stream &s, UsageKind kind) const {
+  s.IndentMore();
+  s.Indent();
+  llvm::StringRef usage_kind =
+      (kind == UsageKind::CommandInterpreter) ? "Command Interpreter" : "API";
+  s << usage_kind << " Usages:";
+  const std::vector<llvm::StringRef> &usages =
+      (kind == UsageKind::CommandInterpreter) ? GetCommandInterpreterUsages()
+                                              : GetSBAPIUsages();
+  if (usages.empty())
+    s << " None\n";
+  else if (usages.size() == 1)
+    s << " " << usages.front() << '\n';
+  else {
+    s << '\n';
+    for (llvm::StringRef usage : usages) {
+      s.IndentMore();
+      s.Indent();
+      s << usage << '\n';
+      s.IndentLess();
+    }
+  }
+  s.IndentLess();
+}
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
index c60e4bb503a37..eb22a960b5345 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
@@ -20,12 +20,8 @@ if (LLDB_ENABLE_LIBEDIT)
 endif()
 
 add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
-  OperatingSystemPythonInterface.cpp
   ScriptedPythonInterface.cpp
-  ScriptedProcessPythonInterface.cpp
   ScriptedThreadPythonInterface.cpp
-  ScriptedThreadPlanPythonInterface.cpp
-  ScriptedPlatformPythonInterface.cpp
 
   LINK_LIBS
     lldbCore
@@ -38,3 +34,9 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
   LINK_COMPONENTS
     Support
   )
+
+add_subdirectory(OperatingSystemPythonInterface)
+add_subdirectory(ScriptedPlatformPythonInterface)
+add_subdirectory(ScriptedProcessPythonInterface)
+add_subdirectory(ScriptedThreadPlanPythonInterface)
+
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..b48f1e818e5d5
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonOperatingSystemPythonInterface PLUGIN
+
+  OperatingSystemPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
similarity index 75%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
index c162c7367c654..019db269a905b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
@@ -13,11 +14,13 @@
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "OperatingSystemPythonInterface.h"
 
 using namespace lldb;
@@ -25,6 +28,8 @@ using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
+LLDB_PLUGIN_DEFINE_ADV(OperatingSystemPythonInterface, ScriptInterpreterPythonOperatingSystemPythonInterface)
+
 OperatingSystemPythonInterface::OperatingSystemPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : OperatingSystemInterface(), ScriptedThreadPythonInterface(interpreter) {}
@@ -79,4 +84,18 @@ OperatingSystemPythonInterface::GetRegisterContextForTID(lldb::tid_t tid) {
   return obj->GetAsString()->GetValue().str();
 }
 
+void OperatingSystemPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "settings set target.process.python-os-plugin-path <script-path>",
+      "settings set process.experimental.os-plugin-reports-all-threads [0/1]"};
+  const std::vector<llvm::StringRef> api_usages = {};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(), llvm::StringRef("Mock thread state"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void OperatingSystemPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
similarity index 83%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
index da7bbf13b1d55..6d60f8b437d1c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
@@ -10,17 +10,19 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedThreadPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
+#include "../ScriptedThreadPythonInterface.h"
+
 #include <optional>
 
 namespace lldb_private {
 class OperatingSystemPythonInterface
     : virtual public OperatingSystemInterface,
-      virtual public ScriptedThreadPythonInterface {
+      virtual public ScriptedThreadPythonInterface,
+      public PluginInterface {
 public:
   OperatingSystemPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -41,6 +43,16 @@ class OperatingSystemPythonInterface
   StructuredData::DictionarySP GetRegisterInfo() override;
 
   std::optional<std::string> GetRegisterContextForTID(lldb::tid_t tid) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "OperatingSystemPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..ae5e525229c02
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedPlatformPythonInterface PLUGIN
+
+  ScriptedPlatformPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
similarity index 83%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
index 6e93bec80056e..3586251bd4aac 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
@@ -6,27 +6,31 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
+#include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPlatformPythonInterface.h"
 
-#include "lldb/Target/ExecutionContext.h"
-
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
+LLDB_PLUGIN_DEFINE_ADV(ScriptedPlatformPythonInterface, ScriptInterpreterPythonScriptedPlatformPythonInterface)
+
 ScriptedPlatformPythonInterface::ScriptedPlatformPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedPlatformInterface(), ScriptedPythonInterface(interpreter) {}
@@ -93,4 +97,14 @@ Status ScriptedPlatformPythonInterface::KillProcess(lldb::pid_t pid) {
   return GetStatusFromMethod("kill_process", pid);
 }
 
+void ScriptedPlatformPythonInterface::Initialize() {
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(), "Mock platform and interact with its processes.",
+      CreateInstance, eScriptLanguagePython, {});
+}
+
+void ScriptedPlatformPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
similarity index 84%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
index 0842d3a003429..01ee40a5a197c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
@@ -10,15 +10,16 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
+#include "../ScriptedPythonInterface.h"
 
 namespace lldb_private {
 class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
-                                        public ScriptedPythonInterface {
+                                        public ScriptedPythonInterface,
+                                        public PluginInterface {
 public:
   ScriptedPlatformPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -43,6 +44,16 @@ class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
   Status LaunchProcess(lldb::ProcessLaunchInfoSP launch_info) override;
 
   Status KillProcess(lldb::pid_t pid) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedPlatformPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..66ed041853f67
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedProcessPythonInterface PLUGIN
+
+  ScriptedProcessPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
similarity index 85%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
index 313c597ce48f3..f4fba0848fe27 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
@@ -6,11 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
-#if LLDB_ENABLE_PYTHON
-// LLDB Python header must be included first
-#include "../lldb-python.h"
-#endif
 #include "lldb/Target/Process.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
@@ -18,10 +15,16 @@
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+// clang-format on
+
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
+#include "../ScriptedThreadPythonInterface.h"
 #include "ScriptedProcessPythonInterface.h"
-#include "ScriptedThreadPythonInterface.h"
+
 #include <optional>
 
 using namespace lldb;
@@ -29,6 +32,8 @@ using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
+LLDB_PLUGIN_DEFINE_ADV(ScriptedProcessPythonInterface, ScriptInterpreterPythonScriptedProcessPythonInterface)
+
 ScriptedProcessPythonInterface::ScriptedProcessPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedProcessInterface(), ScriptedPythonInterface(interpreter) {}
@@ -208,4 +213,24 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetMetadata() {
   return dict;
 }
 
+void ScriptedProcessPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "process attach -C <script-name> [-k key -v value ...]",
+      "process launch -C <script-name> [-k key -v value ...]"};
+  const std::vector<llvm::StringRef> api_usages = {
+      "SBAttachInfo.SetScriptedProcessClassName",
+      "SBAttachInfo.SetScriptedProcessDictionary",
+      "SBTarget.Attach",
+      "SBLaunchInfo.SetScriptedProcessClassName",
+      "SBLaunchInfo.SetScriptedProcessDictionary",
+      "SBTarget.Launch"};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(), llvm::StringRef("Mock process state"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void ScriptedProcessPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
similarity index 88%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
index c75caa9340f25..bb27734739f43 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
@@ -10,16 +10,18 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPROCESSPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
+#include "../ScriptedPythonInterface.h"
+
 #include <optional>
 
 namespace lldb_private {
 class ScriptedProcessPythonInterface : public ScriptedProcessInterface,
-                                       public ScriptedPythonInterface {
+                                       public ScriptedPythonInterface,
+                                       public PluginInterface {
 public:
   ScriptedProcessPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -67,6 +69,16 @@ class ScriptedProcessPythonInterface : public ScriptedProcessInterface,
 
   StructuredData::DictionarySP GetMetadata() override;
 
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedProcessPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+
 private:
   lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override;
 };
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000000000..db41da165d275
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedThreadPlanPythonInterface PLUGIN
+
+  ScriptedThreadPlanPythonInterface.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbHost
+    lldbInterpreter
+    lldbTarget
+    lldbPluginScriptInterpreterPython
+    ${Python3_LIBRARIES}
+    ${LLDB_LIBEDIT_LIBS}
+
+  LINK_COMPONENTS
+    Support
+  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
similarity index 77%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
index f23858c01277c..5f1c7da71bd52 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
@@ -6,23 +6,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
 
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
 #include "ScriptedThreadPlanPythonInterface.h"
 
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 
+LLDB_PLUGIN_DEFINE_ADV(ScriptedThreadPlanPythonInterface, ScriptInterpreterPythonScriptedThreadPlanPythonInterface)
+
 ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {}
@@ -102,4 +107,19 @@ ScriptedThreadPlanPythonInterface::GetStopDescription(lldb::StreamSP &stream) {
   return llvm::Error::success();
 }
 
+void ScriptedThreadPlanPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "thread step-scripted -C <script-name> [-k key -v value ...]"};
+  const std::vector<llvm::StringRef> api_usages = {
+      "SBThread.StepUsingScriptedThreadPlan"};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(),
+      llvm::StringRef("Alter thread stepping logic and stop reason"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void ScriptedThreadPlanPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
similarity index 82%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
index 6ec89b9f59253..c0a82f4cbf46a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
@@ -10,16 +10,18 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 
 #include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
+#include "../ScriptedPythonInterface.h"
+
 #include <optional>
 
 namespace lldb_private {
 class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
-                                          public ScriptedPythonInterface {
+                                          public ScriptedPythonInterface,
+                                          public PluginInterface {
 public:
   ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -41,6 +43,16 @@ class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
   lldb::StateType GetRunState() override;
 
   llvm::Error GetStopDescription(lldb::StreamSP &stream) override;
+
+  static void Initialize();
+
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedThreadPlanPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 70fa6d83e306f..b1a69c9207649 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -14,10 +14,10 @@
 // LLDB Python header must be included first
 #include "lldb-python.h"
 
-#include "Interfaces/OperatingSystemPythonInterface.h"
-#include "Interfaces/ScriptedPlatformPythonInterface.h"
-#include "Interfaces/ScriptedProcessPythonInterface.h"
-#include "Interfaces/ScriptedThreadPlanPythonInterface.h"
+#include "Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h"
+#include "Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h"
+#include "Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h"
+#include "Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h"
 #include "Interfaces/ScriptedThreadPythonInterface.h"
 #include "PythonDataObjects.h"
 #include "PythonReadline.h"

>From 693d757b63e5020e0fa78bb71fc16acdad5f8232 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 00:13:00 -0700
Subject: [PATCH 098/259] Internalize a clang Sarif function

---
 clang/lib/Basic/Sarif.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Sarif.cpp b/clang/lib/Basic/Sarif.cpp
index 8c144df341673..980b0ae40ae22 100644
--- a/clang/lib/Basic/Sarif.cpp
+++ b/clang/lib/Basic/Sarif.cpp
@@ -141,7 +141,7 @@ static unsigned int adjustColumnPos(FullSourceLoc Loc,
 /// @{
 
 /// \internal
-json::Object createMessage(StringRef Text) {
+static json::Object createMessage(StringRef Text) {
   return json::Object{{"text", Text.str()}};
 }
 

>From 6d12b3f67df429bffff6e1953d9f55867d7e2469 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 25 Jul 2024 15:14:39 +0800
Subject: [PATCH 099/259] [VP] Refactor VectorBuilder to avoid layering
 violation. NFC (#99276)

This patch refactors the handling of reductions to eliminate a
layering violation.

* Introduced `getReductionIntrinsicID` in LoopUtils.h for mapping
recurrence kinds to llvm.vector.reduce.* intrinsic IDs.
* Updated `VectorBuilder::createSimpleTargetReduction` to accept an
llvm.vector.reduce.* intrinsic ID directly.
* Added `VPIntrinsic::getForIntrinsic` for mapping an intrinsic ID to
the equivalent VP intrinsic ID.
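
As a rough illustration of the resulting flow (a sketch only; the
variable names follow `createOrderedReduction` in the diff below, and
the calls are the ones introduced by this patch):

  // Map the recurrence kind to the plain llvm.vector.reduce.* ID.
  Intrinsic::ID RdxID = llvm::getReductionIntrinsicID(RecurKind::FAdd);
  // createSimpleTargetReduction now takes that ID and internally
  // translates it to the matching VP intrinsic via
  // VPIntrinsic::getForIntrinsic before emitting the call.
  Value *Ops[] = {Start, Src};
  Value *Rdx = VBuilder.createSimpleTargetReduction(RdxID, SrcTy, Ops);
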
---
 llvm/include/llvm/IR/IntrinsicInst.h          |  4 ++
 llvm/include/llvm/IR/VectorBuilder.h          |  5 +-
 .../include/llvm/Transforms/Utils/LoopUtils.h |  4 ++
 llvm/lib/IR/IntrinsicInst.cpp                 | 19 +++++++
 llvm/lib/IR/VectorBuilder.cpp                 | 57 ++-----------------
 llvm/lib/Transforms/Utils/LoopUtils.cpp       | 44 +++++++++++++-
 llvm/unittests/IR/VPIntrinsicTest.cpp         | 53 +++++++++++++++++
 7 files changed, 129 insertions(+), 57 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index fe3f92da400f8..94c8fa092f45e 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -569,6 +569,10 @@ class VPIntrinsic : public IntrinsicInst {
   /// The llvm.vp.* intrinsics for this instruction Opcode
   static Intrinsic::ID getForOpcode(unsigned OC);
 
+  /// The llvm.vp.* intrinsics for this intrinsic ID \p Id. Return \p Id if it
+  /// is already a VP intrinsic.
+  static Intrinsic::ID getForIntrinsic(Intrinsic::ID Id);
+
   // Whether \p ID is a VP intrinsic ID.
   static bool isVPIntrinsic(Intrinsic::ID);
 
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index 6af7f6075551d..dbb9f4c7336d5 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -15,7 +15,6 @@
 #ifndef LLVM_IR_VECTORBUILDER_H
 #define LLVM_IR_VECTORBUILDER_H
 
-#include <llvm/Analysis/IVDescriptors.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/InstrTypes.h>
 #include <llvm/IR/Instruction.h>
@@ -100,11 +99,11 @@ class VectorBuilder {
                                  const Twine &Name = Twine());
 
   /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param Kind        The kind of recurrence
+  /// \param RdxID       The intrinsic ID of llvm.vector.reduce.*
   /// \param ValTy       The type of operand which the reduction operation is
   ///                    performed.
   /// \param VecOpArray  The operand list.
-  Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+  Value *createSimpleTargetReduction(Intrinsic::ID RdxID, Type *ValTy,
                                      ArrayRef<Value *> VecOpArray,
                                      const Twine &Name = Twine());
 };
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b01a447f3c28b..56880bd4822c7 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -359,6 +359,10 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         SinkAndHoistLICMFlags &LICMFlags,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Returns the llvm.vector.reduce intrinsic that corresponds to the recurrence
+/// kind.
+constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
+
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
 
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 64a14da55b15e..db3b0196f66fd 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -599,6 +599,25 @@ Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) {
   return Intrinsic::not_intrinsic;
 }
 
+constexpr static Intrinsic::ID getForIntrinsic(Intrinsic::ID Id) {
+  if (::isVPIntrinsic(Id))
+    return Id;
+
+  switch (Id) {
+  default:
+    break;
+#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break;
+#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) case Intrinsic::INTRIN:
+#define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID;
+#include "llvm/IR/VPIntrinsics.def"
+  }
+  return Intrinsic::not_intrinsic;
+}
+
+Intrinsic::ID VPIntrinsic::getForIntrinsic(Intrinsic::ID Id) {
+  return ::getForIntrinsic(Id);
+}
+
 bool VPIntrinsic::canIgnoreVectorLengthParam() const {
   using namespace PatternMatch;
 
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index 5ff3082879895..8dbf25277bf5d 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -60,60 +60,13 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
   return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
 }
 
-Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
+Value *VectorBuilder::createSimpleTargetReduction(Intrinsic::ID RdxID,
+                                                  Type *ValTy,
                                                   ArrayRef<Value *> InstOpArray,
                                                   const Twine &Name) {
-  Intrinsic::ID VPID;
-  switch (Kind) {
-  case RecurKind::Add:
-    VPID = Intrinsic::vp_reduce_add;
-    break;
-  case RecurKind::Mul:
-    VPID = Intrinsic::vp_reduce_mul;
-    break;
-  case RecurKind::And:
-    VPID = Intrinsic::vp_reduce_and;
-    break;
-  case RecurKind::Or:
-    VPID = Intrinsic::vp_reduce_or;
-    break;
-  case RecurKind::Xor:
-    VPID = Intrinsic::vp_reduce_xor;
-    break;
-  case RecurKind::FMulAdd:
-  case RecurKind::FAdd:
-    VPID = Intrinsic::vp_reduce_fadd;
-    break;
-  case RecurKind::FMul:
-    VPID = Intrinsic::vp_reduce_fmul;
-    break;
-  case RecurKind::SMax:
-    VPID = Intrinsic::vp_reduce_smax;
-    break;
-  case RecurKind::SMin:
-    VPID = Intrinsic::vp_reduce_smin;
-    break;
-  case RecurKind::UMax:
-    VPID = Intrinsic::vp_reduce_umax;
-    break;
-  case RecurKind::UMin:
-    VPID = Intrinsic::vp_reduce_umin;
-    break;
-  case RecurKind::FMax:
-    VPID = Intrinsic::vp_reduce_fmax;
-    break;
-  case RecurKind::FMin:
-    VPID = Intrinsic::vp_reduce_fmin;
-    break;
-  case RecurKind::FMaximum:
-    VPID = Intrinsic::vp_reduce_fmaximum;
-    break;
-  case RecurKind::FMinimum:
-    VPID = Intrinsic::vp_reduce_fminimum;
-    break;
-  default:
-    llvm_unreachable("No VPIntrinsic for this reduction");
-  }
+  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
   return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 4609376a748f9..0abf6d77496dc 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -918,6 +918,44 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
   return true;
 }
 
+constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
+  switch (RK) {
+  default:
+    llvm_unreachable("Unexpected recurrence kind");
+  case RecurKind::Add:
+    return Intrinsic::vector_reduce_add;
+  case RecurKind::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case RecurKind::And:
+    return Intrinsic::vector_reduce_and;
+  case RecurKind::Or:
+    return Intrinsic::vector_reduce_or;
+  case RecurKind::Xor:
+    return Intrinsic::vector_reduce_xor;
+  case RecurKind::FMulAdd:
+  case RecurKind::FAdd:
+    return Intrinsic::vector_reduce_fadd;
+  case RecurKind::FMul:
+    return Intrinsic::vector_reduce_fmul;
+  case RecurKind::SMax:
+    return Intrinsic::vector_reduce_smax;
+  case RecurKind::SMin:
+    return Intrinsic::vector_reduce_smin;
+  case RecurKind::UMax:
+    return Intrinsic::vector_reduce_umax;
+  case RecurKind::UMin:
+    return Intrinsic::vector_reduce_umin;
+  case RecurKind::FMax:
+    return Intrinsic::vector_reduce_fmax;
+  case RecurKind::FMin:
+    return Intrinsic::vector_reduce_fmin;
+  case RecurKind::FMaximum:
+    return Intrinsic::vector_reduce_fmaximum;
+  case RecurKind::FMinimum:
+    return Intrinsic::vector_reduce_fminimum;
+  }
+}
+
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {
   case Intrinsic::vector_reduce_fadd:
@@ -1215,12 +1253,13 @@ Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
   RecurKind Kind = Desc.getRecurrenceKind();
   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
          "AnyOf reduction is not supported.");
+  Intrinsic::ID Id = getReductionIntrinsicID(Kind);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Type *SrcEltTy = SrcTy->getElementType();
   Value *Iden =
       Desc.getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
   Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleTargetReduction(Kind, SrcTy, Ops);
+  return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops);
 }
 
 Value *llvm::createTargetReduction(IRBuilderBase &B,
@@ -1260,9 +1299,10 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
 
+  Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleTargetReduction(RecurKind::FAdd, SrcTy, Ops);
+  return VBuilder.createSimpleTargetReduction(Id, SrcTy, Ops);
 }
 
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index eab2850ca4e1e..cf0a10d1f2e95 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -367,6 +367,59 @@ TEST_F(VPIntrinsicTest, IntrinsicIDRoundTrip) {
   ASSERT_NE(FullTripCounts, 0u);
 }
 
+/// Check that going from intrinsic to VP intrinsic and back results in the same
+/// intrinsic.
+TEST_F(VPIntrinsicTest, IntrinsicToVPRoundTrip) {
+  bool IsFullTrip = false;
+  Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic + 1;
+  for (; IntrinsicID < Intrinsic::num_intrinsics; IntrinsicID++) {
+    Intrinsic::ID VPID = VPIntrinsic::getForIntrinsic(IntrinsicID);
+    // No equivalent VP intrinsic available.
+    if (VPID == Intrinsic::not_intrinsic)
+      continue;
+
+    // Return itself if passed intrinsic ID is VP intrinsic.
+    if (VPIntrinsic::isVPIntrinsic(IntrinsicID)) {
+      ASSERT_EQ(IntrinsicID, VPID);
+      continue;
+    }
+
+    std::optional<Intrinsic::ID> RoundTripIntrinsicID =
+        VPIntrinsic::getFunctionalIntrinsicIDForVP(VPID);
+    // No equivalent non-predicated intrinsic available.
+    if (!RoundTripIntrinsicID)
+      continue;
+
+    ASSERT_EQ(*RoundTripIntrinsicID, IntrinsicID);
+    IsFullTrip = true;
+  }
+  ASSERT_TRUE(IsFullTrip);
+}
+
+/// Check that going from VP intrinsic to equivalent non-predicated intrinsic
+/// and back results in the same intrinsic.
+TEST_F(VPIntrinsicTest, VPToNonPredIntrinsicRoundTrip) {
+  std::unique_ptr<Module> M = createVPDeclarationModule();
+  assert(M);
+
+  bool IsFullTrip = false;
+  for (const auto &VPDecl : *M) {
+    auto VPID = VPDecl.getIntrinsicID();
+    std::optional<Intrinsic::ID> NonPredID =
+        VPIntrinsic::getFunctionalIntrinsicIDForVP(VPID);
+
+    // No equivalent non-predicated intrinsic available
+    if (!NonPredID)
+      continue;
+
+    Intrinsic::ID RoundTripVPID = VPIntrinsic::getForIntrinsic(*NonPredID);
+
+    ASSERT_EQ(RoundTripVPID, VPID);
+    IsFullTrip = true;
+  }
+  ASSERT_TRUE(IsFullTrip);
+}
+
 /// Check that VPIntrinsic::getDeclarationForParams works.
 TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) {
   std::unique_ptr<Module> M = createVPDeclarationModule();

>From 73d862e478738675f5d919c6a196429acd7b5f50 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid <omair.javaid at linaro.org>
Date: Thu, 25 Jul 2024 12:21:16 +0500
Subject: [PATCH 100/259] Revert "[LLVM] Silence compiler-rt warning in
 runtimes build (#99525)"

This patch broke the LLVM Flang build on Windows; see PR #100202.
This reverts commit f6f88f4b99638821af803d1911ab6a7dac04880b.
---
 llvm/cmake/modules/LLVMExternalProjectUtils.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
index cd071d50bdce9..eef0c16f6847e 100644
--- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
+++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
@@ -350,7 +350,6 @@ function(llvm_ExternalProject_Add name source_dir)
                ${sysroot_arg}
                -DLLVM_BINARY_DIR=${PROJECT_BINARY_DIR}
                -DLLVM_CONFIG_PATH=${llvm_config_path}
-               -DLLVM_CMAKE_DIR=${LLVM_CMAKE_DIR}
                -DLLVM_ENABLE_WERROR=${LLVM_ENABLE_WERROR}
                -DLLVM_HOST_TRIPLE=${LLVM_HOST_TRIPLE}
                -DLLVM_HAVE_LINK_VERSION_SCRIPT=${LLVM_HAVE_LINK_VERSION_SCRIPT}

>From dfe650cdc884af3298cb3378aa1631545a13b720 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 25 Jul 2024 09:18:20 +0200
Subject: [PATCH 101/259] [TBAA] Introduce test for PR96483 (NFC)

---
 .../InstCombine/struct-assign-tbaa-2.ll       | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll

diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
new file mode 100644
index 0000000000000..ae0d2f80ae458
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+%struct.T = type { %struct.Wrapper, %struct.Wrapper }
+%struct.Wrapper = type { i16 }
+
+define void @test1(ptr %a1, ptr %a2) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i8, ptr [[A2:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A2]], align 2, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP0]], ptr [[A1:%.*]], align 2, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds i8, ptr [[A1]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[B2]], align 2, !tbaa [[TBAA0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b = getelementptr inbounds i8, ptr %a2, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %a1, ptr align 2 %a2, i64 2, i1 false), !tbaa !0, !tbaa.struct !6
+  %b2 = getelementptr inbounds %struct.T, ptr %a1, i32 0, i32 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %b2, ptr align 2 %b, i64 2, i1 false), !tbaa !8, !tbaa.struct !6
+  ret void
+}
+
+!0 = !{!1, !4, i64 0, i64 2}
+!1 = !{!2, i64 4, !"_ZTS1T", !4, i64 0, i64 2, !4, i64 2, i64 2}
+!2 = !{!3, i64 1, !"omnipotent char"}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!2, i64 2, !"_ZTS7Wrapper", !5, i64 0, i64 2}
+!5 = !{!2, i64 2, !"short"}
+!6 = !{i64 0, i64 2, !7}
+!7 = !{!5, !5, i64 0, i64 2}
+!8 = !{!1, !4, i64 2, i64 2}
+
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 2}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], i64 2, !"short"}
+; CHECK: [[META2]] = !{[[META3:![0-9]+]], i64 1, !"omnipotent char"}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+;.

>From 6ce7b1f86115f475ee00003ee04833781a675e3e Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Thu, 25 Jul 2024 09:18:53 +0200
Subject: [PATCH 102/259] [TBAA] Do not rewrite TBAA if exists, always null out
 `!tbaa.struct`

Retrieve `!tbaa` metadata via `!tbaa.struct` in `adjustForAccess`
only when `!tbaa` does not already exist, so that struct-path-aware
`MDNodes` emitted via `new-struct-path-tbaa` can be leveraged. Since
`!tbaa.struct` carries memcpy padding semantics among struct fields
and `!tbaa` is already meant to describe alias semantics, it should
be safe to null out `!tbaa.struct` once the memcpy has been
simplified. The `SROA/tbaa-struct.ll` test has gone out of scope, as
`!tbaa` has already replaced `!tbaa.struct` in SROA.
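
As a rough sketch of how this is consumed (the call mirrors the existing
InstCombine code path touched by this patch; the load/store variable names
are illustrative only), the simplified memcpy propagates the adjusted
metadata roughly as follows:

  // Hypothetical usage sketch: InstCombine rewrites a small struct memcpy
  // into a load/store pair and requests metadata sized to that access.
  AAMDNodes AACopyMD = MI->getAAMetadata().adjustForAccess(Size);
  NewLoad->setAAMetadata(AACopyMD);  // !tbaa kept, or derived from !tbaa.struct
  NewStore->setAAMetadata(AACopyMD); // !tbaa.struct is always nulled out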

Fixes: https://github.com/llvm/llvm-project/issues/95661.
---
 llvm/include/llvm/IR/Metadata.h               |  6 ++-
 llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp  |  8 +--
 .../InstCombine/InstCombineCalls.cpp          |  2 +-
 .../InstCombine/struct-assign-tbaa-2.ll       | 11 ++--
 .../InstCombine/struct-assign-tbaa.ll         |  9 ++--
 llvm/test/Transforms/SROA/tbaa-struct.ll      | 41 --------------
 llvm/test/Transforms/SROA/tbaa-struct3.ll     | 54 +++++++++----------
 7 files changed, 45 insertions(+), 86 deletions(-)
 delete mode 100644 llvm/test/Transforms/SROA/tbaa-struct.ll

diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index 22da54a1f03c5..7b54c74fb1b9d 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -846,8 +846,10 @@ struct AAMDNodes {
   AAMDNodes concat(const AAMDNodes &Other) const;
 
   /// Create a new AAMDNode for accessing \p AccessSize bytes of this AAMDNode.
-  /// If his AAMDNode has !tbaa.struct and \p AccessSize matches the size of the
-  /// field at offset 0, get the TBAA tag describing the accessed field.
+  /// If this AAMDNode has !tbaa.struct and \p AccessSize matches the size of
+  /// the field at offset 0, get the TBAA tag describing the accessed field.
+  /// If such an AAMDNode already embeds !tbaa, the existing one is retrieved.
+  /// Finally, !tbaa.struct is zeroed out.
   AAMDNodes adjustForAccess(unsigned AccessSize);
   AAMDNodes adjustForAccess(size_t Offset, Type *AccessTy,
                             const DataLayout &DL);
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index e1cb63a9ab8f9..0d7eb7da8d6b6 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -822,16 +822,16 @@ MDNode *AAMDNodes::extendToTBAA(MDNode *MD, ssize_t Len) {
 AAMDNodes AAMDNodes::adjustForAccess(unsigned AccessSize) {
   AAMDNodes New = *this;
   MDNode *M = New.TBAAStruct;
-  if (M && M->getNumOperands() >= 3 && M->getOperand(0) &&
+  if (!New.TBAA && M && M->getNumOperands() >= 3 && M->getOperand(0) &&
       mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
       mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
       M->getOperand(1) && mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
       mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
           AccessSize &&
-      M->getOperand(2) && isa<MDNode>(M->getOperand(2))) {
-    New.TBAAStruct = nullptr;
+      M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
     New.TBAA = cast<MDNode>(M->getOperand(2));
-  }
+
+  New.TBAAStruct = nullptr;
   return New;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3502bbbbeae25..f6c4b6e180937 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -171,7 +171,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
   IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
 
   // If the memcpy has metadata describing the members, see if we can get the
-  // TBAA tag describing our copy.
+  // TBAA, scope and noalias tags describing our copy.
   AAMDNodes AACopyMD = MI->getAAMetadata().adjustForAccess(Size);
 
   Value *Src = MI->getArgOperand(1);
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
index ae0d2f80ae458..b52a062fc6404 100644
--- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa-2.ll
@@ -15,8 +15,8 @@ define void @test1(ptr %a1, ptr %a2) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A2]], align 2, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    store i16 [[TMP0]], ptr [[A1:%.*]], align 2, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds i8, ptr [[A1]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2, !tbaa [[TBAA0]]
-; CHECK-NEXT:    store i16 [[TMP1]], ptr [[B2]], align 2, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[B2]], align 2, !tbaa [[TBAA6]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -38,8 +38,11 @@ entry:
 !8 = !{!1, !4, i64 2, i64 2}
 
 ;.
-; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 2}
-; CHECK: [[META1]] = !{[[META2:![0-9]+]], i64 2, !"short"}
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META4:![0-9]+]], i64 0, i64 2}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], i64 4, !"_ZTS1T", [[META4]], i64 0, i64 2, [[META4]], i64 2, i64 2}
 ; CHECK: [[META2]] = !{[[META3:![0-9]+]], i64 1, !"omnipotent char"}
 ; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK: [[META4]] = !{[[META2]], i64 2, !"_ZTS7Wrapper", [[META5:![0-9]+]], i64 0, i64 2}
+; CHECK: [[META5]] = !{[[META2]], i64 2, !"short"}
+; CHECK: [[TBAA6]] = !{[[META1]], [[META4]], i64 2, i64 2}
 ;.
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
index 996d2c0e67e16..e96452a3cebc8 100644
--- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -38,8 +38,8 @@ define ptr @test2() {
 define void @test3_multiple_fields(ptr nocapture %a, ptr nocapture %b) {
 ; CHECK-LABEL: @test3_multiple_fields(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4, !tbaa.struct [[TBAA_STRUCT3:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa.struct [[TBAA_STRUCT3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i64 [[TMP0]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -50,8 +50,8 @@ entry:
 define void @test4_multiple_copy_first_field(ptr nocapture %a, ptr nocapture %b) {
 ; CHECK-LABEL: @test4_multiple_copy_first_field(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[A:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -86,5 +86,4 @@ entry:
 ; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
 ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]]}
 ; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[TBAA_STRUCT3]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
 ;.
diff --git a/llvm/test/Transforms/SROA/tbaa-struct.ll b/llvm/test/Transforms/SROA/tbaa-struct.ll
deleted file mode 100644
index 29892cb84d8ef..0000000000000
--- a/llvm/test/Transforms/SROA/tbaa-struct.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -S -passes='sroa<preserve-cfg>' %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
-; RUN: opt -S -passes='sroa<modify-cfg>' %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
-
-; SROA should keep `!tbaa.struct` metadata
-
-%vector = type { float, float }
-declare void @llvm.memcpy.p0.p0.i64(ptr writeonly, ptr readonly, i64, i1 immarg)
-declare <2 x float> @foo(ptr %0)
-
-define void @bar(ptr %y2) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[X14:%.*]] = call <2 x float> @foo(ptr [[Y2:%.*]])
-; CHECK-NEXT:    store <2 x float> [[X14]], ptr [[Y2]], align 4, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
-; CHECK-NEXT:    ret void
-;
-  %x7 = alloca %vector
-  %x14 = call <2 x float> @foo(ptr %y2)
-  store <2 x float> %x14, ptr %x7
-  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %y2, ptr align 4 %x7, i64 8, i1 false), !tbaa.struct !10
-  ret void
-}
-
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C++ TBAA"}
-!7 = !{!"vector", !8, i64 0, !8, i64 4}
-!8 = !{!"float", !4, i64 0}
-!10 = !{i64 0, i64 4, !11, i64 4, i64 4, !11}
-!11 = !{!8, !8, i64 0}
-;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-;.
-; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 4, !1, i64 4, i64 4, !1}
-; CHECK: [[META1:![0-9]+]] = !{!2, !2, i64 0}
-; CHECK: [[META2:![0-9]+]] = !{!"float", !3, i64 0}
-; CHECK: [[META3:![0-9]+]] = !{!"omnipotent char", !4, i64 0}
-; CHECK: [[META4:![0-9]+]] = !{!"Simple C++ TBAA"}
-;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-MODIFY-CFG: {{.*}}
-; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll
index 0fcd787fef976..6a1a23728d3c7 100644
--- a/llvm/test/Transforms/SROA/tbaa-struct3.ll
+++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll
@@ -56,7 +56,7 @@ define void @memcpy_transfer_tbaa_field_and_size_do_not_align(ptr dereferenceabl
 ; CHECK-NEXT:    [[TMP_SROA_2_0_L_PTR_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[L_PTR]], i64 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[B]] to i32
 ; CHECK-NEXT:    [[TMP_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-; CHECK-NEXT:    store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT:    store i16 [[TMP_SROA_2_0_EXTRACT_TRUNC]], ptr [[TMP_SROA_2_0_L_PTR_SROA_IDX]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -77,8 +77,8 @@ define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24)
 ; CHECK-NEXT:    store i31 [[A]], ptr [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4
 ; CHECK-NEXT:    store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4
-; CHECK-NEXT:    [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT5:![0-9]+]]
-; CHECK-NEXT:    store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT5]]
+; CHECK-NEXT:    [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT:    store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT4]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -98,7 +98,7 @@ define void @store_vector_part_first(ptr %y2, float %f) {
 ; CHECK-LABEL: define void @store_vector_part_first(
 ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; CHECK-NEXT:    [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 8
 ; CHECK-NEXT:    store float [[F]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -118,7 +118,7 @@ define void @store_vector_part_second(ptr %y2, float %f) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
 ; CHECK-NEXT:    store float [[F]], ptr [[Y2]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[X7_SROA_2_0_Y2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[Y2]], i64 4
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[X7_SROA_2_0_Y2_SROA_IDX]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    ret void
 ;
   %x7 = alloca { float, float, float, float }
@@ -134,7 +134,7 @@ define void @store_vector_single(ptr %y2, float %f) {
 ; CHECK-LABEL: define void @store_vector_single(
 ; CHECK-SAME: ptr [[Y2:%.*]], float [[F:%.*]]) {
 ; CHECK-NEXT:    [[V_1:%.*]] = call <2 x float> @foo(ptr [[Y2]])
-; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store <2 x float> [[V_1]], ptr [[Y2]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    ret void
 ;
   %x7 = alloca { float, float }
@@ -161,7 +161,7 @@ define void @memset(ptr %dst, ptr align 8 %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 1 [[A_SROA_4_0_SRC_SROA_IDX]], i32 10, i1 false)
 ; CHECK-NEXT:    store i16 1, ptr [[A_SROA_3]], align 2
 ; CHECK-NEXT:    [[A_SROA_0_1_A_1_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_1_A_1_SROA_IDX2]], i8 42, i32 6, i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_1_A_1_SROA_IDX2]], i8 42, i32 6, i1 false)
 ; CHECK-NEXT:    store i16 10794, ptr [[A_SROA_3]], align 2, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 7, i1 true)
 ; CHECK-NEXT:    [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 7
@@ -199,8 +199,8 @@ define void @memset2(ptr %dst, ptr align 8 %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 2 [[A_SROA_4_0_SRC_SROA_IDX]], i32 90, i1 false)
 ; CHECK-NEXT:    store i8 1, ptr [[A_SROA_3]], align 1
 ; CHECK-NEXT:    [[A_SROA_0_202_A_202_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 202
-; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[TBAA6]]
-; CHECK-NEXT:    store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[TBAA6]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 1 [[A_SROA_0_202_A_202_SROA_IDX2]], i8 42, i32 7, i1 false), !tbaa [[TBAA5]]
+; CHECK-NEXT:    store i8 42, ptr [[A_SROA_3]], align 1, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 209, i1 true)
 ; CHECK-NEXT:    [[A_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 209
 ; CHECK-NEXT:    [[A_SROA_3_0_A_SROA_3_0_COPYLOAD1:%.*]] = load volatile i8, ptr [[A_SROA_3]], align 1
@@ -240,7 +240,7 @@ define void @slice_store_v2i8_1(ptr %dst, ptr %dst.2, ptr %src) {
 ; CHECK-NEXT:    [[A_SROA_2_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6
 ; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_COPYLOAD:%.*]] = load <2 x i8>, ptr [[A_SROA_2_0_SRC_SROA_IDX]], align 2
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_2_SROA_0_0_COPYLOAD]], ptr [[A_SROA_2_SROA_0]], align 4
-; CHECK-NEXT:    store <2 x i8> bitcast (<1 x i16> <i16 123> to <2 x i8>), ptr [[A_SROA_2_SROA_0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store <2 x i8> bitcast (<1 x i16> <i16 123> to <2 x i8>), ptr [[A_SROA_2_SROA_0]], align 4
 ; CHECK-NEXT:    [[A_SROA_2_SROA_0_0_A_SROA_2_SROA_0_0_A_SROA_2_6_V_4:%.*]] = load <2 x i8>, ptr [[A_SROA_2_SROA_0]], align 4
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_2_SROA_0_0_A_SROA_2_SROA_0_0_A_SROA_2_6_V_4]], ptr [[DST_2]], align 2
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[A_SROA_0]], i32 6, i1 true)
@@ -279,8 +279,8 @@ define void @slice_store_v2i8_2(ptr %dst, ptr %dst.2, ptr %src) {
 ; CHECK-NEXT:    store i8 [[A_SROA_0_SROA_4_1_COPYLOAD]], ptr [[A_SROA_0_SROA_4]], align 1
 ; CHECK-NEXT:    [[A_SROA_4_1_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[A_SROA_4]], ptr align 1 [[A_SROA_4_1_SRC_SROA_IDX]], i32 5, i1 false)
-; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[A_SROA_0_SROA_1]], align 2, !tbaa.struct [[TBAA_STRUCT9:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[A_SROA_0_SROA_4]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store <2 x i8> zeroinitializer, ptr [[A_SROA_0_SROA_1]], align 2
+; CHECK-NEXT:    store i8 0, ptr [[A_SROA_0_SROA_4]], align 1
 ; CHECK-NEXT:    [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_A_SROA_0_1_V_4:%.*]] = load <2 x i8>, ptr [[A_SROA_0_SROA_1]], align 2
 ; CHECK-NEXT:    store <2 x i8> [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_A_SROA_0_1_V_4]], ptr [[DST_2]], align 2
 ; CHECK-NEXT:    [[A_SROA_0_SROA_1_0_A_SROA_0_SROA_1_1_COPYLOAD3:%.*]] = load volatile <2 x i8>, ptr [[A_SROA_0_SROA_1]], align 2
@@ -317,7 +317,7 @@ define double @tbaa_struct_load(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP_SROA_3_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
 ; CHECK-NEXT:    [[TMP_SROA_3_0_COPYLOAD:%.*]] = load i64, ptr [[TMP_SROA_3_0_SRC_SROA_IDX]], align 8
 ; CHECK-NEXT:    store i64 [[TMP_SROA_3_0_COPYLOAD]], ptr [[TMP_SROA_3]], align 8
-; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_LG:%.*]] = load double, ptr [[TMP_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1:%.*]] = load volatile double, ptr [[TMP_SROA_0]], align 8
 ; CHECK-NEXT:    store volatile double [[TMP_SROA_0_0_TMP_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 8
 ; CHECK-NEXT:    [[TMP_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 8
@@ -356,7 +356,7 @@ define i32 @shorten_integer_store_multiple_fields(ptr %dst, ptr %dst.2, ptr %src
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i32 123, ptr [[A_SROA_0]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i32, ptr [[A_SROA_0]], align 4
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[A_SROA_0]], align 4
 ; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD]], ptr [[DST]], align 1
@@ -375,7 +375,7 @@ define <2 x i16> @shorten_vector_store_multiple_fields(ptr %dst, ptr %dst.2, ptr
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8, !tbaa.struct [[TBAA_STRUCT5]]
+; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 8 [[A_SROA_0]], i32 4, i1 true)
 ; CHECK-NEXT:    ret <2 x i16> [[A_SROA_0_0_A_SROA_0_0_L]]
@@ -393,7 +393,7 @@ define <2 x i16> @shorten_vector_store_single_fields(ptr %dst, ptr %dst.2, ptr %
 ; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca <2 x i32>, align 8
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8, !tbaa.struct [[TBAA_STRUCT10:![0-9]+]]
+; CHECK-NEXT:    store <2 x i32> <i32 1, i32 2>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load <2 x i16>, ptr [[A_SROA_0]], align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 8 [[A_SROA_0]], i32 4, i1 true)
 ; CHECK-NEXT:    ret <2 x i16> [[A_SROA_0_0_A_SROA_0_0_L]]
@@ -429,11 +429,11 @@ define i32 @split_load_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[A3_SROA_5_0_SRC_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
 ; CHECK-NEXT:    [[A3_SROA_5_0_COPYLOAD:%.*]] = load i8, ptr [[A3_SROA_5_0_SRC_SROA_IDX]], align 1
 ; CHECK-NEXT:    store i8 [[A3_SROA_5_0_COPYLOAD]], ptr [[A3_SROA_5]], align 1
-; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD:%.*]] = load i16, ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_0_INSERT:%.*]] = insertvalue { i16, float, i8 } poison, i16 [[A3_SROA_0_0_A3_SROA_0_0_LOAD4_FCA_0_LOAD]], 0
-; CHECK-NEXT:    [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD:%.*]] = load float, ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_1_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_0_INSERT]], float [[A3_SROA_33_0_A3_SROA_33_4_LOAD4_FCA_1_LOAD]], 1
-; CHECK-NEXT:    [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD:%.*]] = load i8, ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[LOAD4_FCA_2_INSERT:%.*]] = insertvalue { i16, float, i8 } [[LOAD4_FCA_1_INSERT]], i8 [[A3_SROA_4_0_A3_SROA_4_8_LOAD4_FCA_2_LOAD]], 2
 ; CHECK-NEXT:    [[UNWRAP2:%.*]] = extractvalue { i16, float, i8 } [[LOAD4_FCA_2_INSERT]], 1
 ; CHECK-NEXT:    [[VALCAST2:%.*]] = bitcast float [[UNWRAP2]] to i32
@@ -492,11 +492,11 @@ define i32 @split_store_with_tbaa_struct(i32 %x, ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[I_2:%.*]] = insertvalue { i16, float, i8 } [[I_1]], float 3.000000e+00, 1
 ; CHECK-NEXT:    [[I_3:%.*]] = insertvalue { i16, float, i8 } [[I_2]], i8 99, 2
 ; CHECK-NEXT:    [[I_3_FCA_0_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 0
-; CHECK-NEXT:    store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i16 [[I_3_FCA_0_EXTRACT]], ptr [[A3_SROA_0]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[I_3_FCA_1_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 1
-; CHECK-NEXT:    store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store float [[I_3_FCA_1_EXTRACT]], ptr [[A3_SROA_33]], align 4, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[I_3_FCA_2_EXTRACT:%.*]] = extractvalue { i16, float, i8 } [[I_3]], 2
-; CHECK-NEXT:    store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    store i8 [[I_3_FCA_2_EXTRACT]], ptr [[A3_SROA_4]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT:    [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1:%.*]] = load volatile i16, ptr [[A3_SROA_0]], align 8
 ; CHECK-NEXT:    store volatile i16 [[A3_SROA_0_0_A3_SROA_0_0_COPYLOAD1]], ptr [[DST]], align 1
 ; CHECK-NEXT:    [[A3_SROA_3_0_DST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 2
@@ -552,11 +552,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias
 ; CHECK: [[META1]] = !{!"float", [[META2:![0-9]+]], i64 0}
 ; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
 ; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
-; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT5]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
-; CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-; CHECK: [[META7]] = !{!"v2f32", [[META2]], i64 0}
-; CHECK: [[TBAA_STRUCT8]] = !{i64 0, i64 2, [[TBAA0]], i64 2, i64 6, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT9]] = !{i64 0, i64 3, [[TBAA0]]}
-; CHECK: [[TBAA_STRUCT10]] = !{i64 0, i64 4, [[TBAA6]]}
+; CHECK: [[TBAA_STRUCT4]] = !{i64 0, i64 4, [[TBAA0]], i64 4, i64 4, [[TBAA0]]}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"v2f32", [[META2]], i64 0}
 ;.

>From 91450f1b57b34034376662dae5452af8c992c103 Mon Sep 17 00:00:00 2001
From: PeterChou1 <peter.chou at mail.utoronto.ca>
Date: Thu, 25 Jul 2024 03:33:02 -0400
Subject: [PATCH 103/259] [clang-doc] switched from using relative to absolute
 paths (#93281)

fixes https://github.com/llvm/llvm-project/issues/92867

This patch changes the way clang-doc index navigation works. Previously
it was based on relative paths, which was error-prone and led to wrong
paths in the anchor tags. The new navigation is based on absolute paths,
which should work reliably and is less confusing in the code.
Because of the differences between serving over an HTTP server and
viewing via the file system, a RootPath variable is now also exported to
the index_json.js file.
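
For illustration (the output directory shown here is hypothetical), the
generated index_json.js now begins with a line such as

  var RootPath = "/abs/path/to/doc-output";

and genLink() appends Ref.Path plus the page name to an absolute root
(the exported RootPath when served over HTTP) instead of computing a
path relative to the current page.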
---
 clang-tools-extra/clang-doc/HTMLGenerator.cpp | 13 +++++
 clang-tools-extra/clang-doc/assets/index.js   | 51 +++++--------------
 .../test/clang-doc/test-path-abs.cpp          |  6 +++
 3 files changed, 32 insertions(+), 38 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-doc/test-path-abs.cpp

diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
index f6b5e8926f903..aef22453035c3 100644
--- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <optional>
 #include <string>
 
@@ -979,6 +980,18 @@ static llvm::Error serializeIndex(ClangDocContext &CDCtx) {
                                    "error creating index file: " +
                                        FileErr.message());
   }
+  llvm::SmallString<128> RootPath(CDCtx.OutDirectory);
+  if (llvm::sys::path::is_relative(RootPath)) {
+    llvm::sys::fs::make_absolute(RootPath);
+  }
+  // Replace backslashes with forward slashes. This shouldn't matter when
+  // rendering the webpage in a web browser, and it helps prevent the
+  // JavaScript from escaping characters incorrectly and introducing bad
+  // paths in the URLs.
+  std::string RootPathEscaped = RootPath.str().str();
+  std::replace(RootPathEscaped.begin(), RootPathEscaped.end(), '\\', '/');
+  OS << "var RootPath = \"" << RootPathEscaped << "\";\n";
+
   CDCtx.Idx.sort();
   llvm::json::OStream J(OS, 2);
   std::function<void(Index)> IndexToJSON = [&](const Index &I) {
diff --git a/clang-tools-extra/clang-doc/assets/index.js b/clang-tools-extra/clang-doc/assets/index.js
index 49818763a4393..6a223de66f84a 100644
--- a/clang-tools-extra/clang-doc/assets/index.js
+++ b/clang-tools-extra/clang-doc/assets/index.js
@@ -1,42 +1,17 @@
-// Append using posix-style a file name or directory to Base
-function append(Base, New) {
-  if (!New)
-    return Base;
-  if (Base)
-    Base += "/";
-  Base += New;
-  return Base;
-}
-
-// Get relative path to access FilePath from CurrentDirectory
-function computeRelativePath(FilePath, CurrentDirectory) {
-  var Path = FilePath;
-  while (Path) {
-    if (CurrentDirectory == Path)
-      return FilePath.substring(Path.length + 1);
-    Path = Path.substring(0, Path.lastIndexOf("/"));
-  }
-
-  var Dir = CurrentDirectory;
-  var Result = "";
-  while (Dir) {
-    if (Dir == FilePath)
-      break;
-    Dir = Dir.substring(0, Dir.lastIndexOf("/"));
-    Result = append(Result, "..")
+function genLink(Ref) {
+  // We treat the file paths differently depending on whether we're
+  // serving via an HTTP server or viewing from the local file system.
+  var Path = window.location.protocol.startsWith("file") ?
+      `${window.location.protocol}//${window.location.host}/${Ref.Path}` :
+      `${window.location.protocol}//${RootPath}/${Ref.Path}`;
+  if (Ref.RefType === "namespace") {
+    Path = `${Path}/index.html`
+  } else if (Ref.Path === "") {
+      Path = `${Path}${Ref.Name}.html`;
+  } else {
+    Path = `${Path}/${Ref.Name}.html`;
   }
-  Result = append(Result, FilePath.substring(Dir.length))
-  return Result;
-}
-
-function genLink(Ref, CurrentDirectory) {
-  var Path = computeRelativePath(Ref.Path, CurrentDirectory);
-  if (Ref.RefType == "namespace")
-    Path = append(Path, "index.html");
-  else
-    Path = append(Path, Ref.Name + ".html")
-
-    ANode = document.createElement("a");
+  ANode = document.createElement("a");
   ANode.setAttribute("href", Path);
   var TextNode = document.createTextNode(Ref.Name);
   ANode.appendChild(TextNode);
diff --git a/clang-tools-extra/test/clang-doc/test-path-abs.cpp b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
new file mode 100644
index 0000000000000..f6cce95bbea0c
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t && mkdir %t
+// RUN: clang-doc --format=html --executor=standalone %s --output=%t
+// RUN: FileCheck %s -input-file=%t/index_json.js  -check-prefix=JSON-INDEX
+// RUN: rm -rf %t
+
+// JSON-INDEX: var RootPath = "{{.*}}test-path-abs.cpp.tmp";
\ No newline at end of file

>From dc1c00f6b13f724154f9883990f8b21fb8dcccef Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 25 Jul 2024 09:03:48 +0100
Subject: [PATCH 104/259] [StackFrameLayoutAnalysis] Use target-specific hook
 for SP offsets (#100386)

StackFrameLayoutAnalysis currently calculates SP-relative offsets in a
target-independent way via MachineFrameInfo offsets. This is incorrect
for some Targets, e.g. AArch64, when there are scalable vector stack
slots.

This patch adds a virtual function to TargetFrameLowering to provide
offsets from SP, with a default implementation matching what is
currently used in StackFrameLayoutAnalysis, and refactors
StackFrameLayoutAnalysis to use this function. Only non-zero scalable
offsets are output by the analysis pass.

An implementation of this function is added for AArch64 targets, which
aims to provide correct SP offsets in most cases.
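
As a worked example of the new output (taken from the first function in the
updated sve-stack-frame-layout.ll test below), a scalable-vector slot that
sits below a 16-byte callee-save area is now reported as

  Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16

meaning the slot starts 16 fixed bytes plus 16 bytes per vscale below the
value of SP at function entry, instead of the previous MFI-relative [SP-16].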
---
 .../llvm/CodeGen/TargetFrameLowering.h        |   7 +
 .../CodeGen/StackFrameLayoutAnalysisPass.cpp  |  51 +-
 llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp  |  14 +
 .../Target/AArch64/AArch64FrameLowering.cpp   |  35 ++
 .../lib/Target/AArch64/AArch64FrameLowering.h |   2 +
 .../CodeGen/AArch64/sve-stack-frame-layout.ll | 480 +++++++++++++++++-
 6 files changed, 563 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 72978b2f746d7..0656c0d739fdf 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -343,6 +343,13 @@ class TargetFrameLowering {
     return getFrameIndexReference(MF, FI, FrameReg);
   }
 
+  /// getFrameIndexReferenceFromSP - This method returns the offset from the
+  /// stack pointer to the slot of the specified index. This function serves to
+  /// provide a comparable offset from a single reference point (the value of
+  /// the stack-pointer at function entry) that can be used for analysis.
+  virtual StackOffset getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                   int FI) const;
+
   /// Returns the callee-saved registers as computed by determineCalleeSaves
   /// in the BitVector \p SavedRegs.
   virtual void getCalleeSaves(const MachineFunction &MF,
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 940aecd1cb363..ff77685f8f354 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -60,15 +60,15 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     int Slot;
     int Size;
     int Align;
-    int Offset;
+    StackOffset Offset;
     SlotType SlotTy;
     bool Scalable;
 
-    SlotData(const MachineFrameInfo &MFI, const int ValOffset, const int Idx)
+    SlotData(const MachineFrameInfo &MFI, const StackOffset Offset,
+             const int Idx)
         : Slot(Idx), Size(MFI.getObjectSize(Idx)),
-          Align(MFI.getObjectAlign(Idx).value()),
-          Offset(MFI.getObjectOffset(Idx) - ValOffset), SlotTy(Invalid),
-          Scalable(false) {
+          Align(MFI.getObjectAlign(Idx).value()), Offset(Offset),
+          SlotTy(Invalid), Scalable(false) {
       Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector;
       if (MFI.isSpillSlotObjectIndex(Idx))
         SlotTy = SlotType::Spill;
@@ -79,10 +79,10 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     }
 
     // We use this to sort in reverse order, so that the layout is displayed
-    // correctly. Scalable slots are sorted to the end of the list.
+    // correctly.
     bool operator<(const SlotData &Rhs) const {
-      return std::make_tuple(!Scalable, Offset) >
-             std::make_tuple(!Rhs.Scalable, Rhs.Offset);
+      return (Offset.getFixed() + Offset.getScalable()) >
+             (Rhs.Offset.getFixed() + Rhs.Offset.getScalable());
     }
   };
 
@@ -149,15 +149,27 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     // For example we store the Offset in YAML as:
     //    ...
     //    - Offset: -8
+    //    - ScalableOffset: -16
+    // Note: the ScalableOffset entries are added only for slots with non-zero
+    // scalable offsets.
     //
-    // But we print it to the CLI as
+    // But we print it to the CLI as:
     //   Offset: [SP-8]
+    //
+    // Or with non-zero scalable offset:
+    //   Offset: [SP-8-16 x vscale]
 
     // Negative offsets will print a leading `-`, so only add `+`
     std::string Prefix =
-        formatv("\nOffset: [SP{0}", (D.Offset < 0) ? "" : "+").str();
-    Rem << Prefix << ore::NV("Offset", D.Offset)
-        << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
+        formatv("\nOffset: [SP{0}", (D.Offset.getFixed() < 0) ? "" : "+").str();
+    Rem << Prefix << ore::NV("Offset", D.Offset.getFixed());
+
+    if (D.Offset.getScalable()) {
+      Rem << ((D.Offset.getScalable() < 0) ? "" : "+")
+          << ore::NV("ScalableOffset", D.Offset.getScalable()) << " x vscale";
+    }
+
+    Rem << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
         << ", Align: " << ore::NV("Align", D.Align)
         << ", Size: " << ore::NV("Size", ElementCount::get(D.Size, D.Scalable));
   }
@@ -170,17 +182,22 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     Rem << "\n    " << ore::NV("DataLoc", Loc);
   }
 
+  StackOffset getStackOffset(const MachineFunction &MF,
+                             const MachineFrameInfo &MFI,
+                             const TargetFrameLowering *FI, int FrameIdx) {
+    if (!FI)
+      return StackOffset::getFixed(MFI.getObjectOffset(FrameIdx));
+
+    return FI->getFrameIndexReferenceFromSP(MF, FrameIdx);
+  }
+
   void emitStackFrameLayoutRemarks(MachineFunction &MF,
                                    MachineOptimizationRemarkAnalysis &Rem) {
     const MachineFrameInfo &MFI = MF.getFrameInfo();
     if (!MFI.hasStackObjects())
       return;
 
-    // ValOffset is the offset to the local area from the SP at function entry.
-    // To display the true offset from SP, we need to subtract ValOffset from
-    // MFI's ObjectOffset.
     const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
-    const int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
 
     LLVM_DEBUG(dbgs() << "getStackProtectorIndex =="
                       << MFI.getStackProtectorIndex() << "\n");
@@ -194,7 +211,7 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
          Idx != EndIdx; ++Idx) {
       if (MFI.isDeadObjectIndex(Idx))
         continue;
-      SlotInfo.emplace_back(MFI, ValOffset, Idx);
+      SlotInfo.emplace_back(MFI, getStackOffset(MF, MFI, FI, Idx), Idx);
     }
 
     // sort the ordering, to match the actual layout in memory
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 48a2094f5d451..7d054cb7c7c71 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -61,6 +61,20 @@ TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                MFI.getOffsetAdjustment());
 }
 
+/// Returns the offset from the stack pointer to the slot of the specified
+/// index. This function serves to provide a comparable offset from a single
+/// reference point (the value of the stack-pointer at function entry) that can
+/// be used for analysis. This is the default implementation using
+/// MachineFrameInfo offsets.
+StackOffset
+TargetFrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                  int FI) const {
+  // To display the true offset from SP, we need to subtract the offset to the
+  // local area from MFI's ObjectOffset.
+  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI) -
+                               getOffsetOfLocalArea());
+}
+
 bool TargetFrameLowering::needsFrameIndexResolution(
     const MachineFunction &MF) const {
   return MF.getFrameInfo().hasStackObjects();
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index b1b83e27c5592..bd530903bb664 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2603,6 +2603,41 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
       /*ForSimm=*/false);
 }
 
+StackOffset
+AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                                   int FI) const {
+  // This function serves to provide a comparable offset from a single reference
+  // point (the value of SP at function entry) that can be used for analysis,
+  // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
+  // correct for all objects in the presence of VLA-area objects or dynamic
+  // stack re-alignment.
+
+  const auto &MFI = MF.getFrameInfo();
+
+  int64_t ObjectOffset = MFI.getObjectOffset(FI);
+
+  // This is correct in the absence of any SVE stack objects.
+  StackOffset SVEStackSize = getSVEStackSize(MF);
+  if (!SVEStackSize)
+    return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
+
+  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+  if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
+    return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
+                            ObjectOffset);
+  }
+
+  bool IsFixed = MFI.isFixedObjectIndex(FI);
+  bool IsCSR =
+      !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
+
+  StackOffset ScalableOffset = {};
+  if (!IsFixed && !IsCSR)
+    ScalableOffset = -SVEStackSize;
+
+  return StackOffset::getFixed(ObjectOffset) + ScalableOffset;
+}
+
 StackOffset
 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
                                                      int FI) const {
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index da315850d6362..0ebab1700e9ce 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -41,6 +41,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
 
   StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
                                      Register &FrameReg) const override;
+  StackOffset getFrameIndexReferenceFromSP(const MachineFunction &MF,
+                                           int FI) const override;
   StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
                                          Register &FrameReg, bool PreferFP,
                                          bool ForSimm) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 34d85d1f76086..36bca2ebd4ada 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -5,9 +5,9 @@
 ; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-16 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8
 
 define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64:
@@ -49,8 +49,8 @@ entry:
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8
 
 define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
 ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_fp:
@@ -90,13 +90,167 @@ entry:
   ret i32 0
 }
 
+; In the presence of dynamic stack-realignment we emit correct offsets for
+; objects which are not realigned. For realigned objects, e.g. the i32 alloca
+; in this test, we emit the correct offset ignoring the re-alignment (i.e. the
+; offset if the alignment requirement is already satisfied).
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_dynamicrealign
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128-16 x vscale], Type: Variable, Align: 128, Size: 4
+
+define i32 @csr_d8_allocnxv4i32i32f64_dynamicrealign(double %d) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_dynamicrealign:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #96
+; CHECK-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #16
+; CHECK-NEXT:    addvl x9, x9, #-1
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffff80
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sub x8, x29, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp]
+; CHECK-NEXT:    stur d0, [x29, #-8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    sub sp, x29, #16
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32, align 128
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; In the presence of VLA-area objects, we emit correct offsets for all objects
+; except for these VLA objects.
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_vla
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 1, Size: 0
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #8
+; CHECK-NEXT:    str x19, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 24
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset w29, -24
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    ubfiz x8, x0, #2, #32
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x8, x8, #15
+; CHECK-NEXT:    and x8, x8, #0x7fffffff0
+; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    sub x8, x29, #8
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str d0, [x19, #8]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    sub sp, x29, #8
+; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %0 = zext i32 %i to i64
+  %b = alloca i32, i64 %0
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_stackargsi32f64
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+8], Type: Variable, Align: 8, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Protector, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-16 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @csr_d8_allocnxv4i32i32f64_stackargsi32f64(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_stackargsi32f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -8
+; CHECK-NEXT:    .cfi_offset b8, -16
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    str d0, [sp]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ldr x29, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d0, ptr %c
+  ret i32 0
+}
+
 ; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_fp
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-32 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8
 
 define i32 @svecc_z8_allocnxv4i32i32f64_fp(double %d, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
 ; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_fp:
@@ -133,3 +287,311 @@ entry:
   store double %d, ptr %c
   ret i32 0
 }
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Protector, Align: 16, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20-32 x vscale], Type: Variable, Align: 4, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+define i32 @svecc_z8_allocnxv4i32i32f64_stackargsi32_fp(double %d, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all"{
+; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    str wzr, [sp, #12]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x29, #-2, mul vl]
+; CHECK-NEXT:    str d0, [sp], #16
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32
+  %c = alloca double
+  tail call void asm sideeffect "", "~{d8}"() #1
+  store <vscale x 4 x i32> %v, ptr %a
+  store i32 zeroinitializer, ptr %b
+  store double %d, ptr %c
+  ret i32 0
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-258 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-260 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-262 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-264 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-266 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-268 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-270 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-272 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-274 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-276 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-278 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-280 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: svecc_call:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w27, -16
+; CHECK-NEXT:    .cfi_offset w28, -24
+; CHECK-NEXT:    .cfi_offset w30, -40
+; CHECK-NEXT:    .cfi_offset w29, -48
+; CHECK-NEXT:    addvl sp, sp, #-18
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 144 * VG
+; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    tbz w19, #0, .LBB7_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB7_2: // %entry
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    mov w1, #45 // =0x2d
+; CHECK-NEXT:    mov w2, #37 // =0x25
+; CHECK-NEXT:    bl memset
+; CHECK-NEXT:    tbz w19, #0, .LBB7_4
+; CHECK-NEXT:  // %bb.3: // %entry
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB7_4: // %entry
+; CHECK-NEXT:    mov w0, #22647 // =0x5877
+; CHECK-NEXT:    movk w0, #59491, lsl #16
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #18
+; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w27
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
+  ret i32 -396142473
+}
+declare ptr @memset(ptr, i32, i32)
+
+; The VA register currently ends up in VLA space - in the presence of VLA-area
+; objects, we emit correct offsets for all objects except for these VLA objects.
+
+; CHECK-FRAMELAYOUT-LABEL: Function: vastate
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-72], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-80], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-88], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-96], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-104], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-112], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: Variable, Align: 16, Size: 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: Variable, Align: 16, Size: 0
+
+define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" {
+; CHECK-LABEL: vastate:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa w29, 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -40
+; CHECK-NEXT:    .cfi_offset w29, -48
+; CHECK-NEXT:    .cfi_offset b8, -56
+; CHECK-NEXT:    .cfi_offset b9, -64
+; CHECK-NEXT:    .cfi_offset b10, -72
+; CHECK-NEXT:    .cfi_offset b11, -80
+; CHECK-NEXT:    .cfi_offset b12, -88
+; CHECK-NEXT:    .cfi_offset b13, -96
+; CHECK-NEXT:    .cfi_offset b14, -104
+; CHECK-NEXT:    .cfi_offset b15, -112
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w20, w0
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh wzr, [x29, #-70]
+; CHECK-NEXT:    stur wzr, [x29, #-68]
+; CHECK-NEXT:    sturh w8, [x29, #-72]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl other
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #80
+; CHECK-NEXT:    cbnz x8, .LBB8_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB8_2: // %entry
+; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    .cfi_def_cfa wsp, 112
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w20
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+entry:
+  tail call void @other()
+  ret i32 %x
+}
+declare void @other()

>From f48c16631de07b47e0721b88bd8004e63897f29a Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Thu, 25 Jul 2024 09:15:03 +0100
Subject: [PATCH 105/259] [lldb][Linux] Parse, but don't store "comm" from
 /proc/stat file (#100387)

As reported in https://github.com/llvm/llvm-project/issues/89710, the `%s`
conversion used for `comm` could, and probably does, overflow the buffer.
We likely haven't seen it cause problems because the data following the
buffer is overwritten again right afterwards.

Also, scanf isn't a great choice here, as `comm` can include many
characters (such as spaces) that would trip up `%s`.

We don't actually use `comm`, so parse it but don't store it; that way
nothing can overflow.
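
Below is a minimal, self-contained sketch of the scanf behaviour involved
(hypothetical buffer size and input, not the actual lldb code): a bare `%s`
stores with no length limit, while the assignment-suppressing `%*s` consumes
the field without writing anywhere.

    #include <cstdio>

    int main() {
      // /proc/<pid>/stat starts roughly as "<pid> (<comm>) <state> ...".
      const char *rest = "1234 (myproc) R 1";

      char comm[16]; // fixed-size buffer, as the old task_comm_len implied
      int pid = 0;
      char state = 0;

      // Risky: "%s" writes as many non-whitespace characters as it finds,
      // so a sufficiently long comm overflows the 16-byte buffer.
      std::sscanf(rest, "%d %s %c", &pid, comm, &state);

      // What this patch does instead: "%*s" consumes the comm field but
      // stores nothing, so there is no buffer to overflow.
      std::sscanf(rest, "%d %*s %c", &pid, &state);

      std::printf("pid=%d state=%c\n", pid, state);
      return 0;
    }
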
---
 lldb/source/Host/linux/Host.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/lldb/source/Host/linux/Host.cpp b/lldb/source/Host/linux/Host.cpp
index 5545f9ef4d70e..0bc736d90ea76 100644
--- a/lldb/source/Host/linux/Host.cpp
+++ b/lldb/source/Host/linux/Host.cpp
@@ -51,11 +51,9 @@ enum class ProcessState {
   Zombie,
 };
 
-constexpr int task_comm_len = 16;
-
 struct StatFields {
   ::pid_t pid = LLDB_INVALID_PROCESS_ID;
-  char comm[task_comm_len];
+  // comm
   char state;
   ::pid_t ppid = LLDB_INVALID_PROCESS_ID;
   ::pid_t pgrp = LLDB_INVALID_PROCESS_ID;
@@ -100,8 +98,8 @@ static bool GetStatusInfo(::pid_t Pid, ProcessInstanceInfo &ProcessInfo,
   StatFields stat_fields;
   if (sscanf(
           Rest.data(),
-          "%d %s %c %d %d %d %d %d %u %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld",
-          &stat_fields.pid, stat_fields.comm, &stat_fields.state,
+          "%d %*s %c %d %d %d %d %d %u %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld",
+          &stat_fields.pid, /* comm, */ &stat_fields.state,
           &stat_fields.ppid, &stat_fields.pgrp, &stat_fields.session,
           &stat_fields.tty_nr, &stat_fields.tpgid, &stat_fields.flags,
           &stat_fields.minflt, &stat_fields.cminflt, &stat_fields.majflt,

>From c3a2efceb53e1e3ab9b92e5bf6518ab51f4713a9 Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Thu, 25 Jul 2024 01:15:10 -0700
Subject: [PATCH 106/259] [lldb/Plugins] Fix build failure on windows following
 2914a4b88837

This patch tries to fix the following build failure on Windows:

https://lab.llvm.org/buildbot/#/builders/141/builds/1083

This started happening after 2914a4b88837, and it appears to be caused by
the special `#include` ordering required for the lldb-python header on Windows.

Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
 .../OperatingSystemPythonInterface.h               |  8 ++++++++
 .../ScriptedPlatformPythonInterface.cpp            | 14 +++++++++-----
 .../ScriptedProcessPythonInterface.cpp             | 13 ++++++++-----
 .../ScriptedThreadPlanPythonInterface.h            |  9 +++++++++
 4 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
index 6d60f8b437d1c..001d49decb5b6 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
@@ -9,6 +9,14 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
index 3586251bd4aac..19d2ee2c717d8 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
@@ -6,6 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+
+#endif
+
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
@@ -15,11 +24,6 @@
 
 #if LLDB_ENABLE_PYTHON
 
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-
 #include "../../SWIGPythonBridge.h"
 #include "../../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPlatformPythonInterface.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
index f4fba0848fe27..794c3e87ba97f 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
@@ -6,6 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/Process.h"
@@ -15,11 +23,6 @@
 
 #if LLDB_ENABLE_PYTHON
 
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-
 #include "../../SWIGPythonBridge.h"
 #include "../../ScriptInterpreterPythonImpl.h"
 #include "../ScriptedThreadPythonInterface.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
index c0a82f4cbf46a..73b17a556464f 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
@@ -9,6 +9,15 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
 

>From 6a5a64c56bc6c7183935367d3cf915ccdd103882 Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christianulmann at gmail.com>
Date: Thu, 25 Jul 2024 10:42:25 +0200
Subject: [PATCH 107/259] Revert "[mlir] Fix block merging" (#100510)

Reverts llvm/llvm-project#97697

The reverted commit introduced non-trivial bugs related to type consistency.
---
 .../BufferDeallocationSimplification.cpp      |   9 +-
 mlir/lib/Transforms/Utils/RegionUtils.cpp     | 204 +-----------------
 .../dealloc-branchop-interface.mlir           |  20 +-
 .../Linalg/detensorize_entry_block.mlir       |   6 +-
 mlir/test/Dialect/Linalg/detensorize_if.mlir  |  67 +++---
 .../Dialect/Linalg/detensorize_while.mlir     |  12 +-
 .../Linalg/detensorize_while_impure_cf.mlir   |  12 +-
 .../Linalg/detensorize_while_pure_cf.mlir     |   4 +-
 .../Transforms/canonicalize-block-merge.mlir  |   6 +-
 mlir/test/Transforms/canonicalize-dce.mlir    |   8 +-
 .../Transforms/make-isolated-from-above.mlir  |  18 +-
 .../test-canonicalize-merge-large-blocks.mlir | 162 --------------
 12 files changed, 83 insertions(+), 445 deletions(-)
 delete mode 100644 mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir

diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
index 5227b22653eef..954485cfede3d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
@@ -463,15 +463,10 @@ struct BufferDeallocationSimplificationPass
                  SplitDeallocWhenNotAliasingAnyOther,
                  RetainedMemrefAliasingAlwaysDeallocatedMemref>(&getContext(),
                                                                 analysis);
-    // We don't want that the block structure changes invalidating the
-    // `BufferOriginAnalysis` so we apply the rewrites witha `Normal` level of
-    // region simplification
-    GreedyRewriteConfig config;
-    config.enableRegionSimplification = GreedySimplifyRegionLevel::Normal;
     populateDeallocOpCanonicalizationPatterns(patterns, &getContext());
 
-    if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
-                                            config)))
+    if (failed(
+            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
       signalPassFailure();
   }
 };
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 946d65cef4186..4c0f15bafbaba 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/IR/Block.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
@@ -17,15 +16,11 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Support/LogicalResult.h"
 
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
 
 #include <deque>
-#include <iterator>
 
 using namespace mlir;
 
@@ -679,91 +674,6 @@ static bool ableToUpdatePredOperands(Block *block) {
   return true;
 }
 
-/// Prunes the redundant list of arguments. E.g., if we are passing an argument
-/// list like [x, y, z, x] this would return [x, y, z] and it would update the
-/// `block` (to whom the argument are passed to) accordingly.
-static SmallVector<SmallVector<Value, 8>, 2> pruneRedundantArguments(
-    const SmallVector<SmallVector<Value, 8>, 2> &newArguments,
-    RewriterBase &rewriter, Block *block) {
-
-  SmallVector<SmallVector<Value, 8>, 2> newArgumentsPruned(
-      newArguments.size(), SmallVector<Value, 8>());
-
-  if (newArguments.empty())
-    return newArguments;
-
-  // `newArguments` is a 2D array of size `numLists` x `numArgs`
-  unsigned numLists = newArguments.size();
-  unsigned numArgs = newArguments[0].size();
-
-  // Map that for each arg index contains the index that we can use in place of
-  // the original index. E.g., if we have newArgs = [x, y, z, x], we will have
-  // idxToReplacement[3] = 0
-  llvm::DenseMap<unsigned, unsigned> idxToReplacement;
-
-  // This is a useful data structure to track the first appearance of a Value
-  // on a given list of arguments
-  DenseMap<Value, unsigned> firstValueToIdx;
-  for (unsigned j = 0; j < numArgs; ++j) {
-    Value newArg = newArguments[0][j];
-    if (!firstValueToIdx.contains(newArg))
-      firstValueToIdx[newArg] = j;
-  }
-
-  // Go through the first list of arguments (list 0).
-  for (unsigned j = 0; j < numArgs; ++j) {
-    bool shouldReplaceJ = false;
-    unsigned replacement = 0;
-    // Look back to see if there are possible redundancies in list 0. Please
-    // note that we are using a map to annotate when an argument was seen first
-    // to avoid a O(N^2) algorithm. This has the drawback that if we have two
-    // lists like:
-    // list0: [%a, %a, %a]
-    // list1: [%c, %b, %b]
-    // We cannot simplify it, because firstVlaueToIdx[%a] = 0, but we cannot
-    // point list1[1](==%b) or list1[2](==%b) to list1[0](==%c).  However, since
-    // the number of arguments can be potentially unbounded we cannot afford a
-    // O(N^2) algorithm (to search to all the possible pairs) and we need to
-    // accept the trade-off.
-    unsigned k = firstValueToIdx[newArguments[0][j]];
-    if (k != j) {
-      shouldReplaceJ = true;
-      replacement = k;
-      // If a possible redundancy is found, then scan the other lists: we
-      // can prune the arguments if and only if they are redundant in every
-      // list.
-      for (unsigned i = 1; i < numLists; ++i)
-        shouldReplaceJ =
-            shouldReplaceJ && (newArguments[i][k] == newArguments[i][j]);
-    }
-    // Save the replacement.
-    if (shouldReplaceJ)
-      idxToReplacement[j] = replacement;
-  }
-
-  // Populate the pruned argument list.
-  for (unsigned i = 0; i < numLists; ++i)
-    for (unsigned j = 0; j < numArgs; ++j)
-      if (!idxToReplacement.contains(j))
-        newArgumentsPruned[i].push_back(newArguments[i][j]);
-
-  // Replace the block's redundant arguments.
-  SmallVector<unsigned> toErase;
-  for (auto [idx, arg] : llvm::enumerate(block->getArguments())) {
-    if (idxToReplacement.contains(idx)) {
-      Value oldArg = block->getArgument(idx);
-      Value newArg = block->getArgument(idxToReplacement[idx]);
-      rewriter.replaceAllUsesWith(oldArg, newArg);
-      toErase.push_back(idx);
-    }
-  }
-
-  // Erase the block's redundant arguments.
-  for (unsigned idxToErase : llvm::reverse(toErase))
-    block->eraseArgument(idxToErase);
-  return newArgumentsPruned;
-}
-
 LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) {
   // Don't consider clusters that don't have blocks to merge.
   if (blocksToMerge.empty())
@@ -812,10 +722,6 @@ LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) {
         }
       }
     }
-
-    // Prune redundant arguments and update the leader block argument list
-    newArguments = pruneRedundantArguments(newArguments, rewriter, leaderBlock);
-
     // Update the predecessors for each of the blocks.
     auto updatePredecessors = [&](Block *block, unsigned clusterIndex) {
       for (auto predIt = block->pred_begin(), predE = block->pred_end();
@@ -912,108 +818,6 @@ static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
   return success(anyChanged);
 }
 
-static LogicalResult dropRedundantArguments(RewriterBase &rewriter,
-                                            Block &block) {
-  SmallVector<size_t> argsToErase;
-
-  // Go through the arguments of the block.
-  for (auto [argIdx, blockOperand] : llvm::enumerate(block.getArguments())) {
-    bool sameArg = true;
-    Value commonValue;
-
-    // Go through the block predecessor and flag if they pass to the block
-    // different values for the same argument.
-    for (auto predIt = block.pred_begin(), predE = block.pred_end();
-         predIt != predE; ++predIt) {
-      auto branch = dyn_cast<BranchOpInterface>((*predIt)->getTerminator());
-      if (!branch) {
-        sameArg = false;
-        break;
-      }
-      unsigned succIndex = predIt.getSuccessorIndex();
-      SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex);
-      auto branchOperands = succOperands.getForwardedOperands();
-      if (!commonValue) {
-        commonValue = branchOperands[argIdx];
-      } else {
-        if (branchOperands[argIdx] != commonValue) {
-          sameArg = false;
-          break;
-        }
-      }
-    }
-
-    // If they are passing the same value, drop the argument.
-    if (commonValue && sameArg) {
-      argsToErase.push_back(argIdx);
-
-      // Remove the argument from the block.
-      rewriter.replaceAllUsesWith(blockOperand, commonValue);
-    }
-  }
-
-  // Remove the arguments.
-  for (auto argIdx : llvm::reverse(argsToErase)) {
-    block.eraseArgument(argIdx);
-
-    // Remove the argument from the branch ops.
-    for (auto predIt = block.pred_begin(), predE = block.pred_end();
-         predIt != predE; ++predIt) {
-      auto branch = cast<BranchOpInterface>((*predIt)->getTerminator());
-      unsigned succIndex = predIt.getSuccessorIndex();
-      SuccessorOperands succOperands = branch.getSuccessorOperands(succIndex);
-      succOperands.erase(argIdx);
-    }
-  }
-  return success(!argsToErase.empty());
-}
-
-/// This optimization drops redundant argument to blocks. I.e., if a given
-/// argument to a block receives the same value from each of the block
-/// predecessors, we can remove the argument from the block and use directly the
-/// original value. This is a simple example:
-///
-/// %cond = llvm.call @rand() : () -> i1
-/// %val0 = llvm.mlir.constant(1 : i64) : i64
-/// %val1 = llvm.mlir.constant(2 : i64) : i64
-/// %val2 = llvm.mlir.constant(3 : i64) : i64
-/// llvm.cond_br %cond, ^bb1(%val0 : i64, %val1 : i64), ^bb2(%val0 : i64, %val2
-/// : i64)
-///
-/// ^bb1(%arg0 : i64, %arg1 : i64):
-///    llvm.call @foo(%arg0, %arg1)
-///
-/// The previous IR can be rewritten as:
-/// %cond = llvm.call @rand() : () -> i1
-/// %val0 = llvm.mlir.constant(1 : i64) : i64
-/// %val1 = llvm.mlir.constant(2 : i64) : i64
-/// %val2 = llvm.mlir.constant(3 : i64) : i64
-/// llvm.cond_br %cond, ^bb1(%val1 : i64), ^bb2(%val2 : i64)
-///
-/// ^bb1(%arg0 : i64):
-///    llvm.call @foo(%val0, %arg0)
-///
-static LogicalResult dropRedundantArguments(RewriterBase &rewriter,
-                                            MutableArrayRef<Region> regions) {
-  llvm::SmallSetVector<Region *, 1> worklist;
-  for (Region &region : regions)
-    worklist.insert(&region);
-  bool anyChanged = false;
-  while (!worklist.empty()) {
-    Region *region = worklist.pop_back_val();
-
-    // Add any nested regions to the worklist.
-    for (Block &block : *region) {
-      anyChanged = succeeded(dropRedundantArguments(rewriter, block));
-
-      for (Operation &op : block)
-        for (Region &nestedRegion : op.getRegions())
-          worklist.insert(&nestedRegion);
-    }
-  }
-  return success(anyChanged);
-}
-
 //===----------------------------------------------------------------------===//
 // Region Simplification
 //===----------------------------------------------------------------------===//
@@ -1028,12 +832,8 @@ LogicalResult mlir::simplifyRegions(RewriterBase &rewriter,
   bool eliminatedBlocks = succeeded(eraseUnreachableBlocks(rewriter, regions));
   bool eliminatedOpsOrArgs = succeeded(runRegionDCE(rewriter, regions));
   bool mergedIdenticalBlocks = false;
-  bool droppedRedundantArguments = false;
-  if (mergeBlocks) {
+  if (mergeBlocks)
     mergedIdenticalBlocks = succeeded(mergeIdenticalBlocks(rewriter, regions));
-    droppedRedundantArguments =
-        succeeded(dropRedundantArguments(rewriter, regions));
-  }
   return success(eliminatedBlocks || eliminatedOpsOrArgs ||
-                 mergedIdenticalBlocks || droppedRedundantArguments);
+                 mergedIdenticalBlocks);
 }
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
index 8e14990502143..5e8104f83cc4d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir
@@ -178,7 +178,7 @@ func.func @condBranchDynamicTypeNested(
 //  CHECK-NEXT: ^bb1
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb6([[ARG1]], %false{{[0-9_]*}} :
+//       CHECK: cf.br ^bb5([[ARG1]], %false{{[0-9_]*}} :
 //       CHECK: ^bb2([[IDX:%.*]]:{{.*}})
 //       CHECK: [[ALLOC1:%.*]] = memref.alloc([[IDX]])
 //  CHECK-NEXT: test.buffer_based
@@ -186,24 +186,20 @@ func.func @condBranchDynamicTypeNested(
 //  CHECK-NEXT: [[OWN:%.+]] = arith.select [[ARG0]], [[ARG0]], [[NOT_ARG0]]
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.cond_br{{.*}}, ^bb3, ^bb4
+//       CHECK: cf.cond_br{{.*}}, ^bb3, ^bb3
 //  CHECK-NEXT: ^bb3:
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]]
-//  CHECK-NEXT: ^bb4:
+//       CHECK: cf.br ^bb4([[ALLOC1]], [[OWN]]
+//  CHECK-NEXT: ^bb4([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}})
 //   CHECK-NOT: bufferization.dealloc
 //   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb5([[ALLOC1]], [[OWN]]
-//  CHECK-NEXT: ^bb5([[ALLOC2:%.*]]:{{.*}}, [[COND1:%.+]]:{{.*}})
-//   CHECK-NOT: bufferization.dealloc
-//   CHECK-NOT: bufferization.clone
-//       CHECK: cf.br ^bb6([[ALLOC2]], [[COND1]]
-//  CHECK-NEXT: ^bb6([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}})
+//       CHECK: cf.br ^bb5([[ALLOC2]], [[COND1]]
+//  CHECK-NEXT: ^bb5([[ALLOC4:%.*]]:{{.*}}, [[COND2:%.+]]:{{.*}})
 //  CHECK-NEXT: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC4]]
 //  CHECK-NEXT: [[OWN:%.+]]:2 = bufferization.dealloc ([[BASE]] :{{.*}}) if ([[COND2]]) retain ([[ALLOC4]], [[ARG2]] :
-//       CHECK: cf.br ^bb7([[ALLOC4]], [[OWN]]#0
-//  CHECK-NEXT: ^bb7([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}})
+//       CHECK: cf.br ^bb6([[ALLOC4]], [[OWN]]#0
+//  CHECK-NEXT: ^bb6([[ALLOC5:%.*]]:{{.*}}, [[COND3:%.+]]:{{.*}})
 //       CHECK: test.copy
 //       CHECK: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[ALLOC5]]
 //  CHECK-NEXT: bufferization.dealloc ([[BASE]] : {{.*}}) if ([[COND3]])
diff --git a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
index 50a2d6bf532aa..d1a89226fdb58 100644
--- a/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_entry_block.mlir
@@ -15,7 +15,7 @@ func.func @main(%arg0: tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @main
 // CHECK-SAME:       (%[[ARG0:.+]]: tensor<f32>) -> tensor<f32>
 // CHECK:   %[[EXTRACTED:.+]] = tensor.extract %[[ARG0]][] : tensor<f32>
-// CHECK: cf.br ^{{.*}}
-// CHECK: ^{{.*}}:
-// CHECK:   %[[ELEMENTS:.+]] = tensor.from_elements %[[EXTRACTED]] : tensor<f32>
+// CHECK: cf.br ^{{.*}}(%[[EXTRACTED]] : f32)
+// CHECK: ^{{.*}}(%[[ARG1:.+]]: f32):
+// CHECK:   %[[ELEMENTS:.+]] = tensor.from_elements %[[ARG1]] : tensor<f32>
 // CHECK:   return %[[ELEMENTS]] : tensor<f32>
diff --git a/mlir/test/Dialect/Linalg/detensorize_if.mlir b/mlir/test/Dialect/Linalg/detensorize_if.mlir
index c728ad21d2209..8d17763c04b6c 100644
--- a/mlir/test/Dialect/Linalg/detensorize_if.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_if.mlir
@@ -42,15 +42,18 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<0>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br
-// CHECK-NEXT:   ^[[bb1:.*]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3
-// CHECK-NEXT:   ^[[bb2]]
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
 
 // -----
@@ -103,17 +106,20 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<0>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br ^[[bb1:.*]]
-// CHECK-NEXT:   ^[[bb1:.*]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb3
-// CHECK-NEXT:   ^[[bb2]]:
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]:
-// CHECK-NEXT:     cf.br ^[[bb4:.*]]
-// CHECK-NEXT:   ^[[bb4]]:
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb3(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     cf.br ^[[bb4:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb4]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
 
 // -----
@@ -165,13 +171,16 @@ func.func @main() -> (tensor<i32>) attributes {} {
 }
 
 // CHECK-LABEL:  func @main()
-// CHECK-DAG:     %[[cst:.*]] = arith.constant dense<10>
-// CHECK-DAG:     arith.constant true
-// CHECK:         cf.br ^[[bb1:.*]]
-// CHECK-NEXT:   ^[[bb1]]:
-// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]], ^bb2
-// CHECK-NEXT:   ^[[bb2]]
-// CHECK-NEXT:     cf.br ^[[bb3:.*]]
-// CHECK-NEXT:   ^[[bb3]]
-// CHECK-NEXT:     return %[[cst]]
+// CHECK-DAG:     arith.constant 0
+// CHECK-DAG:     arith.constant 10
+// CHECK:         cf.br ^[[bb1:.*]](%{{.*}}: i32)
+// CHECK-NEXT:   ^[[bb1]](%{{.*}}: i32):
+// CHECK-NEXT:     arith.cmpi slt, %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^bb2(%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb2]](%{{.*}}: i32)
+// CHECK-NEXT:     arith.addi %{{.*}}, %{{.*}}
+// CHECK-NEXT:     cf.br ^[[bb3:.*]](%{{.*}} : i32)
+// CHECK-NEXT:   ^[[bb3]](%{{.*}}: i32)
+// CHECK-NEXT:     tensor.from_elements %{{.*}} : tensor<i32>
+// CHECK-NEXT:     return %{{.*}}
 // CHECK-NEXT:   }
diff --git a/mlir/test/Dialect/Linalg/detensorize_while.mlir b/mlir/test/Dialect/Linalg/detensorize_while.mlir
index 580a97d3a851b..aa30900f76a33 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while.mlir
@@ -46,11 +46,11 @@ func.func @main(%farg0: tensor<i32>, %farg1: tensor<i32>) -> tensor<i32> attribu
 // DET-ALL:         cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // DET-ALL:       ^[[bb1]](%{{.*}}: i32)
 // DET-ALL:         arith.cmpi slt, {{.*}}
-// DET-ALL:         cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-ALL:       ^[[bb2]]
+// DET-ALL:         cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-ALL:       ^[[bb2]](%{{.*}}: i32)
 // DET-ALL:         arith.addi {{.*}}
 // DET-ALL:         cf.br ^[[bb1]](%{{.*}} : i32)
-// DET-ALL:       ^[[bb3]]:
+// DET-ALL:       ^[[bb3]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements {{.*}}
 // DET-ALL:         return %{{.*}} : tensor<i32>
 
@@ -62,10 +62,10 @@ func.func @main(%farg0: tensor<i32>, %farg1: tensor<i32>) -> tensor<i32> attribu
 // DET-CF:         cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // DET-CF:       ^[[bb1]](%{{.*}}: i32)
 // DET-CF:         arith.cmpi slt, {{.*}}
-// DET-CF:         cf.cond_br {{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-CF:       ^[[bb2]]:
+// DET-CF:         cf.cond_br {{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-CF:       ^[[bb2]](%{{.*}}: i32)
 // DET-CF:         arith.addi {{.*}}
 // DET-CF:         cf.br ^[[bb1]](%{{.*}} : i32)
-// DET-CF:       ^[[bb3]]:
+// DET-CF:       ^[[bb3]](%{{.*}}: i32)
 // DET-CF:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-CF:         return %{{.*}} : tensor<i32>
diff --git a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
index 414d9b94cbf53..955c7be5ef4c8 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while_impure_cf.mlir
@@ -74,8 +74,8 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-ALL:         } -> tensor<i32>
 // DET-ALL:         tensor.extract %{{.*}}[] : tensor<i32>
 // DET-ALL:         cmpi slt, %{{.*}}, %{{.*}} : i32
-// DET-ALL:         cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// DET-ALL:       ^[[bb2]]:
+// DET-ALL:         cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]](%{{.*}} : i32)
+// DET-ALL:       ^[[bb2]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-ALL:         tensor.empty() : tensor<10xi32>
 // DET-ALL:         linalg.generic {{{.*}}} ins(%{{.*}} : tensor<i32>) outs(%{{.*}} : tensor<10xi32>) {
@@ -83,7 +83,7 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-ALL:           linalg.yield %{{.*}} : i32
 // DET-ALL:         } -> tensor<10xi32>
 // DET-ALL:         cf.br ^[[bb1]](%{{.*}} : tensor<10xi32>)
-// DET-ALL:       ^[[bb3]]
+// DET-ALL:       ^[[bb3]](%{{.*}}: i32)
 // DET-ALL:         tensor.from_elements %{{.*}} : tensor<i32>
 // DET-ALL:         return %{{.*}} : tensor<i32>
 // DET-ALL:       }
@@ -95,10 +95,10 @@ func.func @main(%farg0: tensor<10xi32>, %farg1: tensor<i32>) -> tensor<i32> attr
 // DET-CF:         %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor<10xi32>) outs(%{{.*}} : tensor<i32>) {
 // DET-CF:         tensor.extract %{{.*}}[] : tensor<i32>
 // DET-CF:         cmpi slt, %{{.*}}, %{{.*}} : i32
-// DET-CF:         cf.cond_br %{{.*}}, ^bb2, ^bb3
-// DET-CF:       ^bb2:
+// DET-CF:         cf.cond_br %{{.*}}, ^bb2(%{{.*}} : tensor<i32>), ^bb3(%{{.*}} : tensor<i32>)
+// DET-CF:       ^bb2(%{{.*}}: tensor<i32>)
 // DET-CF:         %{{.*}} = linalg.generic {{{.*}}} ins(%{{.*}} : tensor<i32>) outs(%{{.*}} : tensor<10xi32>) {
 // DET-CF:         cf.br ^bb1(%{{.*}} : tensor<10xi32>)
-// DET-CF:       ^bb3:
+// DET-CF:       ^bb3(%{{.*}}: tensor<i32>)
 // DET-CF:         return %{{.*}} : tensor<i32>
 // DET-CF:       }
diff --git a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
index 913e78272db79..6d8d5fe71fca5 100644
--- a/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
+++ b/mlir/test/Dialect/Linalg/detensorize_while_pure_cf.mlir
@@ -49,8 +49,8 @@ func.func @main() -> () attributes {} {
 // CHECK-NEXT:    cf.br ^[[bb1:.*]](%{{.*}} : i32)
 // CHECK-NEXT:  ^[[bb1]](%{{.*}}: i32)
 // CHECK-NEXT:    %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}}
-// CHECK-NEXT:    cf.cond_br %{{.*}}, ^[[bb2:.*]], ^[[bb3:.*]]
-// CHECK-NEXT:  ^[[bb2]]
+// CHECK-NEXT:    cf.cond_br %{{.*}}, ^[[bb2:.*]](%{{.*}} : i32), ^[[bb3:.*]]
+// CHECK-NEXT:  ^[[bb2]](%{{.*}}: i32)
 // CHECK-NEXT:    %{{.*}} = arith.addi %{{.*}}, %{{.*}}
 // CHECK-NEXT:    cf.br ^[[bb1]](%{{.*}} : i32)
 // CHECK-NEXT:  ^[[bb3]]:
diff --git a/mlir/test/Transforms/canonicalize-block-merge.mlir b/mlir/test/Transforms/canonicalize-block-merge.mlir
index 92cfde817cf7f..3b8b1fce0575a 100644
--- a/mlir/test/Transforms/canonicalize-block-merge.mlir
+++ b/mlir/test/Transforms/canonicalize-block-merge.mlir
@@ -87,7 +87,7 @@ func.func @mismatch_operands_matching_arguments(%cond : i1, %arg0 : i32, %arg1 :
 
 // CHECK-LABEL: func @mismatch_argument_uses(
 func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32, i32) {
-  // CHECK: return {{.*}}, {{.*}}
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg1 : i32), ^bb2(%arg0 : i32)
 
@@ -101,7 +101,7 @@ func.func @mismatch_argument_uses(%cond : i1, %arg0 : i32, %arg1 : i32) -> (i32,
 
 // CHECK-LABEL: func @mismatch_argument_types(
 func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) {
-  // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2(%arg1 : i16)
 
@@ -115,7 +115,7 @@ func.func @mismatch_argument_types(%cond : i1, %arg0 : i32, %arg1 : i16) {
 
 // CHECK-LABEL: func @mismatch_argument_count(
 func.func @mismatch_argument_count(%cond : i1, %arg0 : i32) {
-  // CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2
+  // CHECK: cf.cond_br %{{.*}}, ^bb1(%{{.*}}), ^bb2
 
   cf.cond_br %cond, ^bb1(%arg0 : i32), ^bb2
 
diff --git a/mlir/test/Transforms/canonicalize-dce.mlir b/mlir/test/Transforms/canonicalize-dce.mlir
index 84631947970de..ac034d567a26a 100644
--- a/mlir/test/Transforms/canonicalize-dce.mlir
+++ b/mlir/test/Transforms/canonicalize-dce.mlir
@@ -137,10 +137,10 @@ func.func @f(%arg0: f32) {
 // Test case: Test the mechanics of deleting multiple block arguments.
 
 // CHECK:      func @f(%arg0: tensor<1xf32>, %arg1: tensor<2xf32>, %arg2: tensor<3xf32>, %arg3: tensor<4xf32>, %arg4: tensor<5xf32>)
-// CHECK-NEXT:   "test.br"()[^bb1]
-// CHECK-NEXT: ^bb1:
-// CHECK-NEXT:   "foo.print"(%arg1)
-// CHECK-NEXT:   "foo.print"(%arg3)
+// CHECK-NEXT:   "test.br"(%arg1, %arg3)[^bb1] : (tensor<2xf32>, tensor<4xf32>)
+// CHECK-NEXT: ^bb1([[VAL0:%.+]]: tensor<2xf32>, [[VAL1:%.+]]: tensor<4xf32>):
+// CHECK-NEXT:   "foo.print"([[VAL0]])
+// CHECK-NEXT:   "foo.print"([[VAL1]])
 // CHECK-NEXT:   return
 
 
diff --git a/mlir/test/Transforms/make-isolated-from-above.mlir b/mlir/test/Transforms/make-isolated-from-above.mlir
index a9d4325944fd9..58f6cfbc5dd65 100644
--- a/mlir/test/Transforms/make-isolated-from-above.mlir
+++ b/mlir/test/Transforms/make-isolated-from-above.mlir
@@ -78,9 +78,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]]
 //       CHECK:   test.isolated_one_region_op %[[ARG2]], %[[C0]], %[[C1]], %[[D0]], %[[D1]]
 //  CHECK-NEXT:     ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index, %[[B3:[a-zA-Z0-9]+]]: index, %[[B4:[a-zA-Z0-9]+]]: index)
-//  CHECK-NEXT:       cf.br ^bb1
-//       CHECK:     ^bb1:
-//       CHECK:       "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B0]])
+//  CHECK-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CHECK:     ^bb1(%[[B5:.+]]: index)
+//       CHECK:       "foo.yield"(%[[B1]], %[[B2]], %[[B3]], %[[B4]], %[[B5]])
 
 // CLONE1-LABEL: func @make_isolated_from_above_multiple_blocks(
 //  CLONE1-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
@@ -95,9 +95,9 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //  CLONE1-NEXT:     ^bb0(%[[B0:[a-zA-Z0-9]+]]: index, %[[B1:[a-zA-Z0-9]+]]: index, %[[B2:[a-zA-Z0-9]+]]: index)
 //   CLONE1-DAG:       %[[C0_0:.+]] = arith.constant 0 : index
 //   CLONE1-DAG:       %[[C1_0:.+]] = arith.constant 1 : index
-//  CLONE1-NEXT:       cf.br ^bb1
-//       CLONE1:     ^bb1:
-//       CLONE1:       "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B0]])
+//  CLONE1-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CLONE1:     ^bb1(%[[B3:.+]]: index)
+//       CLONE1:       "foo.yield"(%[[C0_0]], %[[C1_0]], %[[B1]], %[[B2]], %[[B3]])
 
 // CLONE2-LABEL: func @make_isolated_from_above_multiple_blocks(
 //  CLONE2-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
@@ -110,6 +110,6 @@ func.func @make_isolated_from_above_multiple_blocks(%arg0 : index, %arg1 : index
 //   CLONE2-DAG:       %[[EMPTY:.+]] = tensor.empty(%[[B1]], %[[B2]])
 //   CLONE2-DAG:       %[[D0:.+]] = tensor.dim %[[EMPTY]], %[[C0]]
 //   CLONE2-DAG:       %[[D1:.+]] = tensor.dim %[[EMPTY]], %[[C1]]
-//  CLONE2-NEXT:       cf.br ^bb1
-//       CLONE2:     ^bb1:
-//       CLONE2:       "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B0]])
+//  CLONE2-NEXT:       cf.br ^bb1(%[[B0]] : index)
+//       CLONE2:     ^bb1(%[[B3:.+]]: index)
+//       CLONE2:       "foo.yield"(%[[C0]], %[[C1]], %[[D0]], %[[D1]], %[[B3]])
diff --git a/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir b/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir
deleted file mode 100644
index e821dcd0c2064..0000000000000
--- a/mlir/test/Transforms/test-canonicalize-merge-large-blocks.mlir
+++ /dev/null
@@ -1,162 +0,0 @@
- // RUN: mlir-opt -pass-pipeline='builtin.module(llvm.func(canonicalize{region-simplify=aggressive}))' %s | FileCheck %s
-
-llvm.func @foo(%arg0: i64)
-
-llvm.func @rand() -> i1
-
-// CHECK-LABEL: func @large_merge_block(
-llvm.func @large_merge_block(%arg0: i64) {
-  // CHECK:  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK:  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK:  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-  // CHECK:  %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64
-  // CHECK:  %[[C4:.*]] = llvm.mlir.constant(4 : i64) : i64
-
-  // CHECK:  llvm.cond_br %5, ^bb1(%[[C1]], %[[C3]], %[[C4]], %[[C2]] : i64, i64, i64, i64), ^bb1(%[[C4]], %[[C2]], %[[C1]], %[[C3]] : i64, i64, i64, i64)
-  // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64, %[[arg3:.*]]: i64):
-  // CHECK:    llvm.cond_br %{{.*}}, ^bb2(%[[arg0]] : i64), ^bb2(%[[arg3]] : i64)
-  // CHECK: ^bb{{.*}}(%11: i64):
-  // CHECK:    llvm.br ^bb{{.*}}
-  // CHECK: ^bb{{.*}}:
-  // CHECK:   llvm.call
-  // CHECK:   llvm.cond_br {{.*}}, ^bb{{.*}}(%[[arg1]] : i64), ^bb{{.*}}(%[[arg2]] : i64)
-  // CHECK: ^bb{{.*}}:
-  // CHECK:   llvm.call
-  // CHECK    llvm.br ^bb{{.*}}
-
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %2 = llvm.mlir.constant(2 : i64) : i64
-  %3 = llvm.mlir.constant(3 : i64) : i64
-  %4 = llvm.mlir.constant(4 : i64) : i64
-  %10 = llvm.icmp "eq" %arg0, %0 : i64
-  llvm.cond_br %10, ^bb1, ^bb14
-^bb1:  // pred: ^bb0
-  %11 = llvm.call @rand() : () -> i1
-  llvm.cond_br %11, ^bb2, ^bb3
-^bb2:  // pred: ^bb1
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.br ^bb4
-^bb3:  // pred: ^bb1
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb4
-^bb4:  // 2 preds: ^bb2, ^bb3
-  %14 = llvm.call @rand() : () -> i1
-  llvm.cond_br %14, ^bb5, ^bb6
-^bb5:  // pred: ^bb4
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb13
-^bb6:  // pred: ^bb4
-  llvm.call @foo(%4) : (i64) -> ()
-  llvm.br ^bb13
-^bb13:  // 2 preds: ^bb11, ^bb12
-  llvm.br ^bb27
-^bb14:  // pred: ^bb0
-  %23 = llvm.call @rand() : () -> i1
-  llvm.cond_br %23, ^bb15, ^bb16
-^bb15:  // pred: ^bb14
-  llvm.call @foo(%4) : (i64) -> ()
-  llvm.br ^bb17
-^bb16:  // pred: ^bb14
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb17
-^bb17:  // 2 preds: ^bb15, ^bb16
-  %26 = llvm.call @rand() : () -> i1
-  llvm.cond_br %26, ^bb18, ^bb19
-^bb18:  // pred: ^bb17
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb26
-^bb19:  // pred: ^bb17
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.br ^bb26
-^bb26:  // 2 preds: ^bb24, ^bb25
-  llvm.br ^bb27
-^bb27:  // 2 preds: ^bb13, ^bb26
-  llvm.return
-}
-
-llvm.func @redundant_args0(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %2 = llvm.mlir.constant(1 : i64) : i64
-  %3 = llvm.mlir.constant(2 : i64) : i64
-  // CHECK  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64), ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64)
-  // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64)
-^bb1:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}
-
-llvm.func @redundant_args1(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %2 = llvm.mlir.constant(1 : i64) : i64
-  %3 = llvm.mlir.constant(2 : i64) : i64
-  // CHECK  %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK  %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK  %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C1]], %[[C2]] : i64, i64), ^bb{{.*}}(%[[C0]], %[[C0]] : i64, i64)
-  // CHECK: ^bb{{.*}}(%{{.*}}: i64, %{{.*}}: i64)
-^bb1:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}
-
-llvm.func @redundant_args_complex(%cond : i1) {
-  %0 = llvm.mlir.constant(0 : i64) : i64
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %2 = llvm.mlir.constant(2 : i64) : i64
-  %3 = llvm.mlir.constant(3 : i64) : i64
-  // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64
-  // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
-  // CHECK: %[[C3:.*]] = llvm.mlir.constant(3 : i64) : i64
-
-  llvm.cond_br %cond, ^bb1, ^bb2
-
-  // CHECK: llvm.cond_br %{{.*}}, ^bb{{.*}}(%[[C2]], %[[C1]], %[[C3]] : i64, i64, i64), ^bb{{.*}}(%[[C0]], %[[C3]], %[[C2]] : i64, i64, i64)
-  // CHECK: ^bb{{.*}}(%[[arg0:.*]]: i64, %[[arg1:.*]]: i64, %[[arg2:.*]]: i64):
-  // CHECK: llvm.call @foo(%[[arg0]])
-  // CHECK: llvm.call @foo(%[[arg0]])
-  // CHECK: llvm.call @foo(%[[arg1]])
-  // CHECK: llvm.call @foo(%[[C2]])
-  // CHECK: llvm.call @foo(%[[arg2]])
-
-^bb1:
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%1) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.br ^bb3
-^bb2:
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%0) : (i64) -> ()
-  llvm.call @foo(%3) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.call @foo(%2) : (i64) -> ()
-  llvm.br ^bb3
-^bb3:
-  llvm.return
-}

>From d82df1b891fecae0af7de5e970b592ce37178a30 Mon Sep 17 00:00:00 2001
From: Edd Dawson <edd.dawson at sony.com>
Date: Thu, 25 Jul 2024 09:43:17 +0100
Subject: [PATCH 108/259] [PS4/PS5][Driver] Always pass LTO options to the
 linker (#100423)

The driver doesn't know if LTO will occur at link time. That's
determined by the presence or absence of LLVM bitcode objects among
those ingested by the linker.

For this reason, LTO options for codegen etc. must be passed to the
linker unconditionally. If LTO does not occur, these options have no
effect.

Also simplify the way LTO options are supplied to the PS4 linker.
`-lto-debug-options` and `-lto-thin-debug-options` are combined and
routed to the same place. So, always use the former, regardless of
full/thin LTO mode.

SIE tracker: TOOLCHAIN-16575
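
For illustration, here is a minimal, hedged sketch (standalone C++, with a
hypothetical crash-diagnostics directory) of how the combined option string
is assembled after this change, mirroring the AddLTOFlag lambda in
PS4CPU.cpp below: the flags are appended unconditionally, and if the linker
never performs LTO the resulting -lto-debug-options string is simply ignored.

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      std::string LTOArgs;
      auto AddLTOFlag = [&](const std::string &Flag) { LTOArgs += " " + Flag; };

      // Appended regardless of whether LTO will actually happen at link time.
      AddLTOFlag("-enable-jmc-instrument");
      AddLTOFlag("-crash-diagnostics-dir=/tmp/diags"); // hypothetical path

      std::vector<std::string> CmdArgs;
      CmdArgs.push_back("-lto-debug-options=" + LTOArgs);

      // Prints: -lto-debug-options= -enable-jmc-instrument -crash-diagnostics-dir=/tmp/diags
      std::printf("%s\n", CmdArgs.back().c_str());
      return 0;
    }
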
---
 clang/lib/Driver/ToolChains/PS4CPU.cpp | 79 +++++++++++---------------
 clang/test/Driver/lto-jobs.c           |  5 +-
 clang/test/Driver/ps4-linker.c         | 18 +++---
 clang/test/Driver/ps5-linker.c         | 10 ++--
 clang/test/Driver/unified-lto.c        | 23 +++++++-
 5 files changed, 70 insertions(+), 65 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index d6af9388e54a6..813a0fbedd2b1 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -152,48 +152,36 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Output.getFilename());
   }
 
-  const bool UseLTO = D.isUsingLTO();
   const bool UseJMC =
       Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false);
 
   const char *LTOArgs = "";
-  auto AddCodeGenFlag = [&](Twine Flag) {
+  auto AddLTOFlag = [&](Twine Flag) {
     LTOArgs = Args.MakeArgString(Twine(LTOArgs) + " " + Flag);
   };
 
-  if (UseLTO) {
-    // This tells LTO to perform JustMyCode instrumentation.
-    if (UseJMC)
-      AddCodeGenFlag("-enable-jmc-instrument");
+  // If the linker sees bitcode objects it will perform LTO. We can't tell
+  // whether or not that will be the case at this point. So, unconditionally
+  // pass LTO options to ensure proper codegen, metadata production, etc if
+  // LTO indeed occurs.
+  if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto,
+                   true))
+    CmdArgs.push_back(D.getLTOMode() == LTOK_Thin ? "--lto=thin"
+                                                  : "--lto=full");
+  if (UseJMC)
+    AddLTOFlag("-enable-jmc-instrument");
 
-    if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
-      AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+  if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
+    AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
 
-    StringRef Parallelism = getLTOParallelism(Args, D);
-    if (!Parallelism.empty())
-      AddCodeGenFlag(Twine("-threads=") + Parallelism);
+  if (StringRef Threads = getLTOParallelism(Args, D); !Threads.empty())
+    AddLTOFlag(Twine("-threads=") + Threads);
 
-    const char *Prefix = nullptr;
-    if (D.getLTOMode() == LTOK_Thin)
-      Prefix = "-lto-thin-debug-options=";
-    else if (D.getLTOMode() == LTOK_Full)
-      Prefix = "-lto-debug-options=";
-    else
-      llvm_unreachable("new LTO mode?");
-
-    CmdArgs.push_back(Args.MakeArgString(Twine(Prefix) + LTOArgs));
-  }
+  CmdArgs.push_back(Args.MakeArgString(Twine("-lto-debug-options=") + LTOArgs));
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
     TC.addSanitizerArgs(Args, CmdArgs, "-l", "");
 
-  if (D.isUsingLTO() && Args.hasArg(options::OPT_funified_lto)) {
-    if (D.getLTOMode() == LTOK_Thin)
-      CmdArgs.push_back("--lto=thin");
-    else if (D.getLTOMode() == LTOK_Full)
-      CmdArgs.push_back("--lto=full");
-  }
-
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
                             options::OPT_s, options::OPT_t});
 
@@ -259,37 +247,34 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Output.getFilename());
   }
 
-  const bool UseLTO = D.isUsingLTO();
   const bool UseJMC =
       Args.hasFlag(options::OPT_fjmc, options::OPT_fno_jmc, false);
 
-  auto AddCodeGenFlag = [&](Twine Flag) {
+  auto AddLTOFlag = [&](Twine Flag) {
     CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=") + Flag));
   };
 
-  if (UseLTO) {
-    // This tells LTO to perform JustMyCode instrumentation.
-    if (UseJMC)
-      AddCodeGenFlag("-enable-jmc-instrument");
+  // If the linker sees bitcode objects it will perform LTO. We can't tell
+  // whether or not that will be the case at this point. So, unconditionally
+  // pass LTO options to ensure proper codegen, metadata production, etc if
+  // LTO indeed occurs.
+  if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto,
+                   true))
+    CmdArgs.push_back(D.getLTOMode() == LTOK_Thin ? "--lto=thin"
+                                                  : "--lto=full");
 
-    if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
-      AddCodeGenFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+  if (UseJMC)
+    AddLTOFlag("-enable-jmc-instrument");
 
-    StringRef Parallelism = getLTOParallelism(Args, D);
-    if (!Parallelism.empty())
-      CmdArgs.push_back(Args.MakeArgString(Twine("-plugin-opt=jobs=") + Parallelism));
-  }
+  if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
+    AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
+
+  if (StringRef Jobs = getLTOParallelism(Args, D); !Jobs.empty())
+    AddLTOFlag(Twine("jobs=") + Jobs);
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
     TC.addSanitizerArgs(Args, CmdArgs, "-l", "");
 
-  if (D.isUsingLTO() && Args.hasArg(options::OPT_funified_lto)) {
-    if (D.getLTOMode() == LTOK_Thin)
-      CmdArgs.push_back("--lto=thin");
-    else if (D.getLTOMode() == LTOK_Full)
-      CmdArgs.push_back("--lto=full");
-  }
-
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
                             options::OPT_s, options::OPT_t});
 
diff --git a/clang/test/Driver/lto-jobs.c b/clang/test/Driver/lto-jobs.c
index b4f109e4c502c..2c7ca02ea4779 100644
--- a/clang/test/Driver/lto-jobs.c
+++ b/clang/test/Driver/lto-jobs.c
@@ -6,12 +6,15 @@
 // RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS-ACTION < %t %s
 //
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto-jobs=5 2> %t
+// RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS-ACTION < %t %s
+//
 // CHECK-LINK-THIN-JOBS-ACTION: "-plugin-opt=jobs=5"
 //
 // RUN: %clang --target=x86_64-scei-ps4 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-PS4-LINK-THIN-JOBS-ACTION < %t %s
 //
-// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-thin-debug-options= -threads=5"
+// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-debug-options= -threads=5"
 
 // RUN: %clang --target=x86_64-apple-darwin13.3.0 -### %s -flto=thin -flto-jobs=5 2> %t
 // RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS2-ACTION < %t %s
diff --git a/clang/test/Driver/ps4-linker.c b/clang/test/Driver/ps4-linker.c
index be989cdd7d5b1..449da3040e758 100644
--- a/clang/test/Driver/ps4-linker.c
+++ b/clang/test/Driver/ps4-linker.c
@@ -1,20 +1,18 @@
 // Test the driver's control over the JustMyCode behavior with linker flags.
 
-// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-THIN-LTO,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-FULL-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
 
-// CHECK-NOT: -enable-jmc-instrument
-// CHECK-THIN-LTO: "-lto-thin-debug-options= -enable-jmc-instrument"
-// CHECK-FULL-LTO: "-lto-debug-options= -enable-jmc-instrument"
+// CHECK-LTO: "-lto-debug-options= -enable-jmc-instrument"
 
 // Check the default library name.
 // CHECK-LIB: "--whole-archive" "-lSceDbgJmc" "--no-whole-archive"
 
 // Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags.
 
-// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-THIN-LTO %s
-// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-FULL-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
 
-// CHECK-DIAG-THIN-LTO: "-lto-thin-debug-options= -crash-diagnostics-dir=mydumps"
-// CHECK-DIAG-FULL-LTO: "-lto-debug-options= -crash-diagnostics-dir=mydumps"
+// CHECK-DIAG-LTO: "-lto-debug-options= -crash-diagnostics-dir=mydumps"
diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c
index 9f1e3a273b2db..cf39d5bae97ac 100644
--- a/clang/test/Driver/ps5-linker.c
+++ b/clang/test/Driver/ps5-linker.c
@@ -1,10 +1,9 @@
 // Test the driver's control over the JustMyCode behavior with linker flags.
 
 // RUN: %clang --target=x86_64-scei-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
-// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LTO,CHECK-LIB %s
+// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s
 
-// CHECK-NOT: -plugin-opt=-enable-jmc-instrument
-// CHECK-LTO: -plugin-opt=-enable-jmc-instrument
+// CHECK: -plugin-opt=-enable-jmc-instrument
 
 // Check the default library name.
 // CHECK-LIB: "--whole-archive" "-lSceJmc_nosubmission" "--no-whole-archive"
@@ -12,7 +11,6 @@
 // Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags.
 
 // RUN: %clang --target=x86_64-scei-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s
-// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
+// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s
 
-// CHECK-DIAG-NOT: -plugin-opt=-crash-diagnostics-dir=mydumps
-// CHECK-DIAG-LTO: -plugin-opt=-crash-diagnostics-dir=mydumps
+// CHECK-DIAG: -plugin-opt=-crash-diagnostics-dir=mydumps
diff --git a/clang/test/Driver/unified-lto.c b/clang/test/Driver/unified-lto.c
index 3a6fe44f5b32d..490aaca59939d 100644
--- a/clang/test/Driver/unified-lto.c
+++ b/clang/test/Driver/unified-lto.c
@@ -7,6 +7,27 @@
 // NOUNIT-NOT: "-flto-unit"
 
 // RUN: %clang --target=x86_64-sie-ps5 -### %s -funified-lto 2>&1 | FileCheck --check-prefix=NOUNILTO %s
-// NOUNILTO: clang: warning: argument unused during compilation: '-funified-lto'
 // NOUNILTO: "-cc1"
 // NOUNILTO-NOT: "-funified-lto
+
+// On PlayStation -funified-lto is the default. `-flto(=...)` influences the
+// `--lto=...` option passed to linker, unless `-fno-unified-lto` is supplied.
+// PS4:
+// RUN: %clang --target=x86_64-sie-ps4 -### %s 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto=full 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -flto=thin 2>&1 | FileCheck --check-prefixes=LD,LTOTHIN %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -fno-unified-lto -flto=full 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// RUN: %clang --target=x86_64-sie-ps4 -### %s -fno-unified-lto -flto=thin 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// PS5:
+// RUN: %clang --target=x86_64-sie-ps5 -### %s 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=full 2>&1 | FileCheck --check-prefixes=LD,LTOFULL %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -flto=thin 2>&1 | FileCheck --check-prefixes=LD,LTOTHIN %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=full 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=thin 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
+
+// LD: {{.*ld}}"
+// LTOFULL-SAME: "--lto=full"
+// LTOTHIN-SAME: "--lto=thin"
+// NOLTO-NOT: "--lto

>From c7a3346ab6a8fbd551a80bd4028ec8624daa35e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski at arm.com>
Date: Thu, 25 Jul 2024 09:44:34 +0100
Subject: [PATCH 109/259] [mlir][linalg] Fix scalable vectorisation of
 tensor.extract (#100325)

This PR fixes one very specific aspect of vectorising `tensor.extract`
Ops when targeting scalable vectors. Namely, it makes sure that the
scalable flag is correctly propagated when creating
`vector::ShapeCastOp`.

BEFORE:
```mlir
vector.shape_cast %idx_vec : vector<1x1x[4]xindex> to vector<4xindex>
```

AFTER:
```mlir
vector.shape_cast %idx_vec : vector<1x1x[4]xindex> to vector<[4]xindex>
```

This particular ShapeCastOp is created when generating an index for
`vector.transfer_read` operations. Strictly speaking, casting is not
really required. However, it makes the subsequent address calculation
much simpler (*).

The following test is updated to demonstrate the use of
`vector.shape_cast` by the vectoriser:
* @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous

A similar test with scalable vectors is also added.

(*) At this point in the vectoriser it is known
that all leading dims in the index vector are "1".
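
To make "propagating the scalable flag" concrete, here is a small standalone
C++ sketch, using plain vectors in place of MLIR's VectorType API (all names
below are illustrative, not the actual API):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative stand-in for a vector type: a shape plus a per-dimension
// scalability flag (true means the dim is scalable, i.e. printed as [N]).
struct VecType {
  std::vector<int64_t> shape;
  std::vector<bool> scalableDims;
};

int main() {
  // The index vector produced by the vectoriser, e.g. vector<1x1x[4]xindex>.
  VecType idxVec{{1, 1, 4}, {false, false, true}};

  // The fix: when building the 1-D type for the shape_cast, take the trailing
  // dim *and* its scalability flag, yielding vector<[4]xindex> rather than
  // the incorrect vector<4xindex>.
  VecType flat{{idxVec.shape.back()}, {idxVec.scalableDims.back()}};

  assert(flat.shape.front() == 4);
  assert(flat.scalableDims.front() && "scalability flag must be preserved");
  return 0;
}
```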
---
 .../Linalg/Transforms/Vectorization.cpp       |   9 +-
 .../vectorize-tensor-extract-masked.mlir      | 111 ++++++++++++++++--
 2 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index c4dab7d061b4b..9185663799e52 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1077,19 +1077,20 @@ vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state,
   //   * for vector indices (e.g. `vector<1x1x4xindex>`) - extract the bottom
   //    (0th) element and use that.
   SmallVector<Value> transferReadIdxs;
-  auto resTrailingDim = resultType.getShape().back();
   auto zero = rewriter.create<arith::ConstantOp>(
       loc, rewriter.getI32Type(), rewriter.getZeroAttr(rewriter.getI32Type()));
   for (size_t i = 0; i < extractOp.getIndices().size(); i++) {
-    auto idx = bvm.lookup(extractOp.getIndices()[i]);
+    Value idx = bvm.lookup(extractOp.getIndices()[i]);
     if (idx.getType().isIndex()) {
       transferReadIdxs.push_back(idx);
       continue;
     }
 
     auto indexAs1dVector = rewriter.create<vector::ShapeCastOp>(
-        loc, VectorType::get({resTrailingDim}, rewriter.getIndexType()),
-        bvm.lookup(extractOp.getIndices()[i]));
+        loc,
+        VectorType::get(resultType.getShape().back(), rewriter.getIndexType(),
+                        resultType.getScalableDims().back()),
+        idx);
     transferReadIdxs.push_back(
         rewriter.create<vector::ExtractElementOp>(loc, indexAs1dVector, zero));
   }
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
index f042753780013..964565620fd01 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
@@ -1,29 +1,52 @@
 // RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
 
-func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x3xf32>) -> tensor<1x3xf32> {
+func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+    %src: tensor<80x16xf32>,
+    %output : tensor<1x3xf32>,
+    %idx: index) -> tensor<1x3xf32> {
+
   %c79 = arith.constant 79 : index
   %1 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
     iterator_types = ["parallel", "parallel"]
-  } outs(%extracted_slice : tensor<1x3xf32>) {
+  } outs(%output : tensor<1x3xf32>) {
   ^bb0(%out: f32):
     %2 = linalg.index 1 : index
-    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
-    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %idx)
+    %extracted = tensor.extract %src[%c79, %3] : tensor<80x16xf32>
     linalg.yield %extracted : f32
   } -> tensor<1x3xf32>
   return %1 : tensor<1x3xf32>
 }
 
 // CHECK-LABEL:   func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous
-// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 3 : index
-// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_5]] : vector<1x4xi1>
-// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
-// CHECK:           %[[VAL_11:.*]] = vector.broadcast {{.*}} : index to vector<4xindex>
-// CHECK:           %[[VAL_12:.*]] = arith.addi {{.*}} : vector<4xindex>
-// CHECK:           %[[VAL_20:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
-// CHECK:           %[[VAL_22:.*]] = vector.mask %[[VAL_8]] { vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
+// CHECK-SAME:      %[[SRC:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x3xf32>,
+// CHECK-SAME:      %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
+
+/// Create the mask
+// CHECK-DAG:       %[[DIM_0:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[DIM_1:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C79:.*]] = arith.constant 79 : index
+// CHECK:           %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x4xi1>
+
+/// TODO: This transfer_read is redundant - remove
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
+
+/// Calculate the index vector
+// CHECK:           %[[STEP:.*]] = vector.step : vector<4xindex>
+// CHECK:           %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<4xindex>
+// CHECK:           %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<4xindex>
+// CHECK:           %[[C0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<4xindex> to vector<4xindex>
+
+/// Extract the starting point from the index vector
+// CHECK:           %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<4xindex>
+
+// Final read and write
+// CHECK:           %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
+// CHECK:           %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -33,7 +56,69 @@ module attributes {transform.with_named_sequence} {
    }
 }
 
- // -----
+// -----
+
+// Identical to the above, but with scalable vectors.
+
+func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable(
+    %src: tensor<80x16xf32>,
+    %output : tensor<1x3xf32>,
+    %idx: index) -> tensor<1x3xf32> {
+
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%output : tensor<1x3xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %idx)
+    %extracted = tensor.extract %src[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x3xf32>
+
+  return %1 : tensor<1x3xf32>
+}
+
+// CHECK-LABEL:   func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable
+// CHECK-SAME:      %[[SRC:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x3xf32>,
+// CHECK-SAME:      %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
+
+/// Create the mask
+// CHECK-DAG:       %[[DIM_0:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[DIM_1:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C79:.*]] = arith.constant 79 : index
+// CHECK:           %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x[4]xi1>
+
+/// TODO: This transfer_read is redundant - remove
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+
+/// Calculate the index vector
+// CHECK:           %[[STEP:.*]] = vector.step : vector<[4]xindex>
+// CHECK:           %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<[4]xindex>
+// CHECK:           %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<[4]xindex>
+// CHECK:           %[[C0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<[4]xindex> to vector<[4]xindex>
+
+/// Extract the starting point from the index vector
+// CHECK:           %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<[4]xindex>
+
+// Final read and write
+// CHECK:           %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+// CHECK:           %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK:           vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x[4]xf32>, tensor<1x3xf32> } : vector<1x[4]xi1> -> tensor<1x3xf32>
+
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, [4]] {vectorize_nd_extract} : !transform.any_op
+     transform.yield
+   }
+}
+
+// -----
 
 func.func @masked_dynamic_vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<?x?xf32>, %arg0: index, %extracted_slice : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c79 = arith.constant 79 : index

>From 98c73d5df7ff0b5d9c10bc9d44a584d631def1e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski at arm.com>
Date: Thu, 25 Jul 2024 09:50:25 +0100
Subject: [PATCH 110/259] [mlir][vector] Restrict vector.shape_cast (scalable
 vectors) (#100331)

Updates the verifier for `vector.shape_cast` so that incorrect cases
where "scalability" is dropped are immediately rejected. For example:
```mlir
  vector.shape_cast %vec : vector<1x1x[4]xindex> to vector<4xindex>
```

Also, as a separate PR, I've prepared a fix for the Linalg vectorizer to
avoid generating such shape casts (*):
* https://github.com/llvm/llvm-project/pull/100325

(*) Note, that's just one specific case that I've identified so far.
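
Below is a minimal standalone sketch of the check, with std::vector<bool>
standing in for the per-dimension scalability flags; it is illustrative only,
and the actual verifier change is in VectorOps.cpp in the diff that follows.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Counts the dims that carry the scalable flag; mirrors the intent of the
// new VectorType::getNumScalableDims() helper.
static int64_t numScalableDims(const std::vector<bool> &scalableDims) {
  return std::count(scalableDims.begin(), scalableDims.end(), true);
}

int main() {
  // vector<15x[2]xf32> -> vector<30xf32>: the scalability flag is dropped.
  std::vector<bool> source{false, true};
  std::vector<bool> result{false};

  if (numScalableDims(source) != numScalableDims(result))
    std::cout << "error: different number of scalable dims at source ("
              << numScalableDims(source) << ") and result ("
              << numScalableDims(result) << ")\n";
  return 0;
}
```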
---
 mlir/include/mlir/IR/BuiltinTypes.td     |  5 +++++
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 10 ++++++++++
 mlir/test/Dialect/Vector/invalid.mlir    | 14 ++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 0b3532dcc7d4f..176a167a3ca31 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -1168,6 +1168,11 @@ def Builtin_Vector : Builtin_Type<"Vector", "vector",
       return !llvm::is_contained(getScalableDims(), false);
     }
 
+    /// Get the number of scalable dimensions.
+    int64_t getNumScalableDims() const {
+      return llvm::count(getScalableDims(), true);
+    }
+
     /// Get or create a new VectorType with the same shape as `this` and an
     /// element type of bitwidth scaled by `scale`.
     /// Return null if the scaled element type cannot be represented.
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index df3a59ed80ad4..d297c40760cd8 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5238,6 +5238,16 @@ static LogicalResult verifyVectorShapeCast(Operation *op,
     if (!isValidShapeCast(resultShape, sourceShape))
       return op->emitOpError("invalid shape cast");
   }
+
+  // Check that (non-)scalability is preserved
+  int64_t sourceNScalableDims = sourceVectorType.getNumScalableDims();
+  int64_t resultNScalableDims = resultVectorType.getNumScalableDims();
+  if (sourceNScalableDims != resultNScalableDims)
+    return op->emitOpError("different number of scalable dims at source (")
+           << sourceNScalableDims << ") and result (" << resultNScalableDims
+           << ")";
+  sourceVectorType.getNumDynamicDims();
+
   return success();
 }
 
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index 208982a3e0e7b..00914c1d1baf6 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1182,6 +1182,20 @@ func.func @shape_cast_invalid_rank_expansion(%arg0 : vector<15x2xf32>) {
 
 // -----
 
+func.func @shape_cast_scalability_flag_is_dropped(%arg0 : vector<15x[2]xf32>) {
+  // expected-error at +1 {{different number of scalable dims at source (1) and result (0)}}
+  %0 = vector.shape_cast %arg0 : vector<15x[2]xf32> to vector<30xf32>
+}
+
+// -----
+
+func.func @shape_cast_scalability_flag_is_dropped(%arg0 : vector<2x[15]x[2]xf32>) {
+  // expected-error at +1 {{different number of scalable dims at source (2) and result (1)}}
+  %0 = vector.shape_cast %arg0 : vector<2x[15]x[2]xf32> to vector<30x[2]xf32>
+}
+
+// -----
+
 func.func @bitcast_not_vector(%arg0 : vector<5x1x3x2xf32>) {
   // expected-error at +1 {{'vector.bitcast' invalid kind of type specified}}
   %0 = vector.bitcast %arg0 : vector<5x1x3x2xf32> to f32

>From 4ca1a901dc6ca1926a594f57e6c4997ea9d185a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet at google.com>
Date: Thu, 25 Jul 2024 11:21:18 +0200
Subject: [PATCH 111/259] [clang][CUDA] Assume unknown emission status for
 skipped function definitions (#100124)

Emission status seems to be only used by CUDA/OpenMP/HIP compiles, to
figure out when to emit diagnostics. The current logic returns "unknown"
when the definition is missing, so I extended that to
skipped-function-bodies as well.
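
As a rough standalone illustration of that decision (the struct and field
names below are made up; the real change is to Sema::getEmissionStatus in the
diff that follows):

```cpp
#include <iostream>

// Made-up stand-in for the properties Sema inspects; the real code queries
// the FunctionDecl and its GVA linkage instead of plain bools.
struct FuncInfo {
  bool hasDefinition;
  bool skippedBody;
  bool discardableLinkage;
};

// A function only counts as "known emitted" when a definition exists, its
// body was not skipped, and its linkage is not discardable.
static bool knownEmitted(const FuncInfo &F) {
  return F.hasDefinition && !F.skippedBody && !F.discardableLinkage;
}

int main() {
  FuncInfo skipped{/*hasDefinition=*/true, /*skippedBody=*/true,
                   /*discardableLinkage=*/false};
  // With skipped function bodies the linkage cannot be computed, so the
  // status stays "unknown" rather than "will be emitted".
  std::cout << (knownEmitted(skipped) ? "emitted" : "unknown") << "\n";
  return 0;
}
```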
---
 clang/lib/Sema/SemaDecl.cpp             | 6 ++++--
 clang/unittests/Tooling/ToolingTest.cpp | 5 +++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index bb25a0b3a45ae..575bd292f27de 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -20148,8 +20148,10 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
     // be emitted, because (say) the definition could include "inline".
     const FunctionDecl *Def = FD->getDefinition();
 
-    return Def && !isDiscardableGVALinkage(
-                      getASTContext().GetGVALinkageForFunction(Def));
+    // We can't compute linkage when we skip function bodies.
+    return Def && !Def->hasSkippedBody() &&
+           !isDiscardableGVALinkage(
+               getASTContext().GetGVALinkageForFunction(Def));
   };
 
   if (LangOpts.OpenMPIsTargetDevice) {
diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp
index 2e3da2cd2a701..f41a44fa0922a 100644
--- a/clang/unittests/Tooling/ToolingTest.cpp
+++ b/clang/unittests/Tooling/ToolingTest.cpp
@@ -586,6 +586,11 @@ TEST(runToolOnCode, TestSkipFunctionBody) {
   EXPECT_FALSE(runToolOnCodeWithArgs(
       std::make_unique<SkipBodyAction>(),
       "template<typename T> int skipMeNot() { an_error_here }", Args2));
+
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      std::make_unique<SkipBodyAction>(),
+      "__inline __attribute__((__gnu_inline__)) void skipMe() {}",
+      {"--cuda-host-only", "-nocudainc", "-xcuda"}));
 }
 
 TEST(runToolOnCodeWithArgs, TestNoDepFile) {

>From f87e9d42c70476b63f4c87e52651f653f7e2f036 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Thu, 25 Jul 2024 10:24:51 +0100
Subject: [PATCH 112/259] [libcxx][test] Update picolibc version (#100348)

This is the latest as of today, and it fixes one of the xfails.

Since
https://github.com/picolibc/picolibc/commit/5e4d0c80f49b4efe8be85e36b5e85d65bd343214,
TIME_UTC is defined.
---
 .../support.runtime/ctime.timespec.compile.pass.cpp      | 3 ---
 libcxx/utils/ci/build-picolibc.sh                        | 9 ++-------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp b/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
index b7fd892003e2d..194262bb42e3b 100644
--- a/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.runtime/ctime.timespec.compile.pass.cpp
@@ -11,9 +11,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
-// picolibc doesn't define TIME_UTC.
-// XFAIL: LIBCXX-PICOLIBC-FIXME
-
 // ::timespec_get is provided by the C library, but it's marked as
 // unavailable until macOS 10.15
 // XFAIL: target={{.+}}-apple-macosx10.{{13|14}}
diff --git a/libcxx/utils/ci/build-picolibc.sh b/libcxx/utils/ci/build-picolibc.sh
index 713e277354768..400e4dcab99a3 100755
--- a/libcxx/utils/ci/build-picolibc.sh
+++ b/libcxx/utils/ci/build-picolibc.sh
@@ -69,13 +69,8 @@ picolibc_source_dir="${build_dir}/picolibc-source"
 picolibc_build_dir="${build_dir}/picolibc-build"
 mkdir -p "${picolibc_source_dir}"
 mkdir -p "${picolibc_build_dir}"
-# Download the version of picolibc that was the latest at the time this script was written.
-# Following changes are required and were introduced after version 1.8.5:
-# - updated semihost arguments handling,
-# - added missing macros in stdio.h
-# - external linkage for isblank
-# Version following 1.8.5, was not released by the time of writing.
-picolibc_commit="04a90c56d7aac61880f205ec29b3dce6a9de0342"
+# Download a known good version of picolibc.
+picolibc_commit="48fbc2009c6473293d03d5ec6f190565c6223a5c"
 curl -L "https://github.com/picolibc/picolibc/archive/${picolibc_commit}.zip" --output "${picolibc_source_dir}/picolibc.zip"
 unzip -q "${picolibc_source_dir}/picolibc.zip" -d "${picolibc_source_dir}"
 mv "${picolibc_source_dir}/picolibc-${picolibc_commit}"/* "${picolibc_source_dir}"

>From 456c5121f8e6f37881f236bdf573ef29cad5af1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet at google.com>
Date: Thu, 25 Jul 2024 11:27:14 +0200
Subject: [PATCH 113/259] [clang][Driver] Add cc1 flag for setting
 SkipFunctionBodies (#100135)

This is an option set by certain tools (clangd and ASTUnit). Sometimes
there are crashes in clang that are unique to this configuration, and it's
really hard to provide reproducers without invoking the tool.
---
 clang/include/clang/Driver/Options.td        |  3 +++
 clang/test/Frontend/skip-function-bodies.cpp | 13 +++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 clang/test/Frontend/skip-function-bodies.cpp

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 28a5cb71d2219..26811bf948ae5 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -7534,6 +7534,9 @@ def code_completion_brief_comments : Flag<["-"], "code-completion-brief-comments
 def code_completion_with_fixits : Flag<["-"], "code-completion-with-fixits">,
   HelpText<"Include code completion results which require small fix-its.">,
   MarshallingInfoFlag<FrontendOpts<"CodeCompleteOpts.IncludeFixIts">>;
+def skip_function_bodies : Flag<["-"], "skip-function-bodies">,
+  HelpText<"Skip function bodies when possible">,
+  MarshallingInfoFlag<FrontendOpts<"SkipFunctionBodies">>;
 def disable_free : Flag<["-"], "disable-free">,
   HelpText<"Disable freeing of memory on exit">,
   MarshallingInfoFlag<FrontendOpts<"DisableFree">>;
diff --git a/clang/test/Frontend/skip-function-bodies.cpp b/clang/test/Frontend/skip-function-bodies.cpp
new file mode 100644
index 0000000000000..d0593b474bda2
--- /dev/null
+++ b/clang/test/Frontend/skip-function-bodies.cpp
@@ -0,0 +1,13 @@
+// Trivial check to ensure skip-function-bodies flag is propagated.
+//
+// RUN: %clang_cc1 -verify -skip-function-bodies -pedantic-errors %s
+// expected-no-diagnostics
+
+int f() {
+  // normally this should emit some diags, but we're skipping it!
+  this is garbage;
+}
+
+// Make sure we only accept it as a cc1 arg.
+// RUN: not %clang -skip-function-bodies %s 2>&1 | FileCheck %s
+// CHECK: clang: error: unknown argument '-skip-function-bodies'; did you mean '-Xclang -skip-function-bodies'?

>From 74e964c7c17cae6fd523bc93b3644893853989fc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 25 Jul 2024 10:55:07 +0100
Subject: [PATCH 114/259] [AMDGPU] Add GFX12 test coverage for export
 instructions (#100415)

---
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll   | 177 +++++++++---------
 .../CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll    |   3 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll     |  67 +++++++
 3 files changed, 158 insertions(+), 89 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 67e4feed21fac..c120c58ce6f33 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -1,14 +1,15 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2
 
 ; GCN-LABEL: {{^}}test_export_zeroes_f32:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off done{{$}}
 define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
@@ -23,7 +24,7 @@ define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -34,7 +35,7 @@ define amdgpu_kernel void @test_export_en_src0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -45,7 +46,7 @@ define amdgpu_kernel void @test_export_en_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -56,7 +57,7 @@ define amdgpu_kernel void @test_export_en_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -67,7 +68,7 @@ define amdgpu_kernel void @test_export_en_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -78,7 +79,7 @@ define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -89,8 +90,8 @@ define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -102,8 +103,8 @@ define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -112,8 +113,8 @@ define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_export_mrt7_f32:
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
 define amdgpu_kernel void @test_export_mrt7_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false)
@@ -125,8 +126,8 @@ define amdgpu_kernel void @test_export_mrt7_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_z_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -138,8 +139,8 @@ define amdgpu_kernel void @test_export_z_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_null_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -151,8 +152,8 @@ define amdgpu_kernel void @test_export_null_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved10_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -164,8 +165,8 @@ define amdgpu_kernel void @test_export_reserved10_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved11_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -177,8 +178,8 @@ define amdgpu_kernel void @test_export_reserved11_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -190,8 +191,8 @@ define amdgpu_kernel void @test_export_pos0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -203,8 +204,8 @@ define amdgpu_kernel void @test_export_pos3_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -216,8 +217,8 @@ define amdgpu_kernel void @test_export_param0_f32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param31_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
@@ -229,10 +230,10 @@ define amdgpu_kernel void @test_export_param31_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_vm_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true)
@@ -254,8 +255,8 @@ define amdgpu_kernel void @test_export_vm_f32() #0 {
 
 
 ; GCN-LABEL: {{^}}test_export_zeroes_i32:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, off done{{$}}
 define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
@@ -270,7 +271,7 @@ define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -281,7 +282,7 @@ define amdgpu_kernel void @test_export_en_src0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 off, [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -292,7 +293,7 @@ define amdgpu_kernel void @test_export_en_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -303,7 +304,7 @@ define amdgpu_kernel void @test_export_en_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 off, off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -314,7 +315,7 @@ define amdgpu_kernel void @test_export_en_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -325,7 +326,7 @@ define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -336,8 +337,8 @@ define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -349,8 +350,8 @@ define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -359,8 +360,8 @@ define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
 
 ; GCN-LABEL: {{^}}test_export_mrt7_i32:
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: {{exp|export}} mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
 define amdgpu_kernel void @test_export_mrt7_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false)
@@ -372,8 +373,8 @@ define amdgpu_kernel void @test_export_mrt7_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_z_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -385,8 +386,8 @@ define amdgpu_kernel void @test_export_z_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_null_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -398,8 +399,8 @@ define amdgpu_kernel void @test_export_null_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved10_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -411,8 +412,8 @@ define amdgpu_kernel void @test_export_reserved10_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_reserved11_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -424,8 +425,8 @@ define amdgpu_kernel void @test_export_reserved11_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -437,8 +438,8 @@ define amdgpu_kernel void @test_export_pos0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: {{exp|export}} pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_pos3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -450,8 +451,8 @@ define amdgpu_kernel void @test_export_pos3_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -463,8 +464,8 @@ define amdgpu_kernel void @test_export_param0_i32() #0 {
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; PREGFX11-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; PREGFX11: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; PREGFX11: {{exp|export}} param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_param31_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
@@ -476,10 +477,10 @@ define amdgpu_kernel void @test_export_param31_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; PREGFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GFX11: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; PREGFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GFX11: {{exp|export}} mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
 define amdgpu_kernel void @test_export_vm_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true)
@@ -554,15 +555,15 @@ end:
 ; GFX8-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s0
 ; GFX8-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
 ; GFX8-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
-; GFX8: exp param0 [[Y]], [[X]], [[Z0]], [[W0]]{{$}}
-; GFX8-NEXT: exp param1 [[Y]], [[X]], [[Z1]], [[W1]] done{{$}}
+; GFX8: {{exp|export}} param0 [[Y]], [[X]], [[Z0]], [[W0]]{{$}}
+; GFX8-NEXT: {{exp|export}} param1 [[Y]], [[X]], [[Z1]], [[W1]] done{{$}}
 
 ; GFX10-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
 ; GFX10-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
 ; GFX10-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
 ; GFX10-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
-; GFX10: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
-; GFX10-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
+; GFX10: {{exp|export}} param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
+; GFX10-NEXT: {{exp|export}} param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
 define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)
@@ -572,9 +573,9 @@ define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param:
-; PREGFX11: exp pos0
+; PREGFX11: {{exp|export}} pos0
 ; PREGFX11-NOT: s_waitcnt
-; PREGFX11: exp param0
+; PREGFX11: {{exp|export}} param0
 define amdgpu_kernel void @test_export_pos_before_param(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -584,9 +585,9 @@ define amdgpu_kernel void @test_export_pos_before_param(float %x, float %y) #0 {
 }
 
 ; GCN-LABEL: {{^}}test_export_pos4_before_param:
-; GFX10: exp pos4
+; GFX10: {{exp|export}} pos4
 ; GFX10-NOT: s_waitcnt
-; GFX10: exp param0
+; GFX10: {{exp|export}} param0
 define amdgpu_kernel void @test_export_pos4_before_param(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -596,13 +597,13 @@ define amdgpu_kernel void @test_export_pos4_before_param(float %x, float %y) #0
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param_ordered:
-; PREGFX11: exp pos0
-; PREGFX11: exp pos1
-; PREGFX11: exp pos2
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11: {{exp|export}} pos1
+; PREGFX11: {{exp|export}} pos2
 ; PREGFX11-NOT: s_waitcnt
-; PREGFX11: exp param0
-; PREGFX11: exp param1
-; PREGFX11: exp param2
+; PREGFX11: {{exp|export}} param0
+; PREGFX11: {{exp|export}} param1
+; PREGFX11: {{exp|export}} param2
 define amdgpu_kernel void @test_export_pos_before_param_ordered(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
@@ -616,9 +617,9 @@ define amdgpu_kernel void @test_export_pos_before_param_ordered(float %x, float
 }
 
 ; GCN-LABEL: {{^}}test_export_pos_before_param_across_load:
-; PREGFX11: exp pos0
-; PREGFX11-NEXT: exp param0
-; PREGFX11-NEXT: exp param1
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11-NEXT: {{exp|export}} param0
+; PREGFX11-NEXT: {{exp|export}} param1
 define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false)
@@ -630,9 +631,9 @@ define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0
 ; GCN-LABEL: {{^}}test_export_across_store_load:
 ; PREGFX11: buffer_store
 ; PREGFX11: buffer_load
-; PREGFX11: exp pos0
-; PREGFX11: exp param0
-; PREGFX11: exp param1
+; PREGFX11: {{exp|export}} pos0
+; PREGFX11: {{exp|export}} param0
+; PREGFX11: {{exp|export}} param1
 define amdgpu_kernel void @test_export_across_store_load(i32 %idx, float %v) #0 {
   %data0 = alloca <4 x float>, align 8, addrspace(5)
   %data1 = alloca <4 x float>, align 8, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
index 52441bcb82f5c..429ee21598384 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.prim.ll
@@ -1,12 +1,13 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=NOPRIM %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=PRIM %s
 
 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
 
 ; GCN-LABEL: {{^}}test_export_prim_i32:
 ; NOPRIM: exp invalid_target_20 v0, off, off, off done{{$}}
-; PRIM: exp prim v0, off, off, off done{{$}}
+; PRIM: {{exp|export}} prim v0, off, off, off done{{$}}
 define amdgpu_gs void @test_export_prim_i32(i32 inreg %a) #0 {
   call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %a, i32 undef, i32 undef, i32 undef, i1 true, i1 false)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index 18c711d0b2aec..c62a8882c4245 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
 
 declare void @llvm.amdgcn.exp.row.i32(i32, i32, i32, i32, i32, i32, i1, i32)
 declare void @llvm.amdgcn.exp.row.f32(i32, i32, float, float, float, float, i1, i32)
@@ -13,6 +15,13 @@ define amdgpu_kernel void @undef_i32() #0 {
 ; GFX11-NEXT:    exp pos0 off, off, off, off row_en
 ; GFX11-NEXT:    exp pos1 off, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: undef_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 off, off, off, off row_en
+; GFX12-NEXT:    export pos1 off, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.i32(i32 13, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 true, i32 0)
   ret void
@@ -25,6 +34,13 @@ define amdgpu_kernel void @undef_f32() #0 {
 ; GFX11-NEXT:    exp pos0 off, off, off, off row_en
 ; GFX11-NEXT:    exp pos1 off, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: undef_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 off, off, off, off row_en
+; GFX12-NEXT:    export pos1 off, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.f32(i32 12, i32 0, float undef, float undef, float undef, float undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.f32(i32 13, i32 0, float undef, float undef, float undef, float undef, i1 true, i32 0)
   ret void
@@ -38,6 +54,14 @@ define amdgpu_kernel void @zero_i32() #0 {
 ; GFX11-NEXT:    exp pos0 v0, v0, v0, off row_en
 ; GFX11-NEXT:    exp pos1 v0, v0, v0, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: zero_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, v0, v0, off row_en
+; GFX12-NEXT:    export pos1 v0, v0, v0, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.i32(i32 13, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 true, i32 0)
   ret void
@@ -51,6 +75,14 @@ define amdgpu_kernel void @one_f32() #0 {
 ; GFX11-NEXT:    exp pos0 v0, v0, v0, off row_en
 ; GFX11-NEXT:    exp pos1 v0, v0, v0, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: one_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, v0, v0, off row_en
+; GFX12-NEXT:    export pos1 v0, v0, v0, off done row_en
+; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.row.f32(i32 12, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 false, i32 0)
   call void @llvm.amdgcn.exp.row.f32(i32 13, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 true, i32 0)
   ret void
@@ -63,6 +95,13 @@ define amdgpu_kernel void @id_i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 m0, 0
 ; GFX11-NEXT:    exp pos0 v0, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: id_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_mov_b32 m0, 0
+; GFX12-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 0)
   ret void
@@ -77,6 +116,15 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 {
 ; GFX11-NEXT:    s_mov_b32 m0, s0
 ; GFX11-NEXT:    exp pos0 v0, off, off, off done row_en
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: id_arg_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 m0, s0
+; GFX12-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 %row)
   ret void
@@ -102,6 +150,25 @@ define amdgpu_kernel void @id_row_i32() #0 {
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GFX11-GISEL-NEXT:    exp pos0 v1, off, off, off done row_en
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: id_row_i32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0x63
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT:    export pos0 v0, off, off, off done row_en
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: id_row_i32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x63
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX12-GISEL-NEXT:    export pos0 v1, off, off, off done row_en
+; GFX12-GISEL-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id)
   ret void

>From e24dc34aa085b9e8d3ea58cc5f59f80bc4c7cdb4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Jul 2024 14:03:28 +0400
Subject: [PATCH 115/259] AMDGPU: Fix asserting in DAG kernel argument lowering
 on v6i32 (#100528)

Remove this pointless assertion for the number of vector elements.
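
A minimal IR sketch of the kind of input that used to hit the assertion (the
function name is illustrative; the patch adds an equivalent @v6i32_arg test
below): a kernel argument whose vector element count is not a power of two and
falls outside the 3/5/9-12 set the assert allowed. The memory type is now
simply rounded up to the next power-of-two vector type, so <6 x i32> is loaded
as <8 x i32> (s_load_dwordx8) instead of asserting.

    define amdgpu_kernel void @v6i32_example(<6 x i32> %in) {
      ; 6 elements: non-power-of-two and outside the old 3/5/9-12 whitelist
      store <6 x i32> %in, ptr addrspace(1) null
      ret void
    }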
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  4 --
 .../AMDGPU/kernel-argument-dag-lowering.ll    | 42 +++++++++++++++++++
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ec386000302f1..2ad91de566323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1245,10 +1245,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
 
       // Round up vec3/vec5 argument.
       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
-        assert(MemVT.getVectorNumElements() == 3 ||
-               MemVT.getVectorNumElements() == 5 ||
-               (MemVT.getVectorNumElements() >= 9 &&
-                MemVT.getVectorNumElements() <= 12));
         MemVT = MemVT.getPow2VectorType(State.getContext());
       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
         MemVT = MemVT.getRoundIntegerType(State.getContext());
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index f74f9a8f2bdd8..1a0aa09e2e40a 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -27,6 +27,48 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}v5i8_arg:
+; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
+define amdgpu_kernel void @v5i8_arg(<5 x i8> %in) nounwind {
+  store <5 x i8> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i8_arg:
+; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
+define amdgpu_kernel void @v6i8_arg(<6 x i8> %in) nounwind {
+  store <6 x i8> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i16_arg:
+; GCN: s_load_dwordx4 s[0:3], s[8:9], 0x0
+define amdgpu_kernel void @v5i16_arg(<5 x i16> %in) nounwind {
+  store <5 x i16> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i16_arg:
+; GCN-DAG: s_load_dwordx4 s[0:3], s[8:9], 0x0
+define amdgpu_kernel void @v6i16_arg(<6 x i16> %in) nounwind {
+  store <6 x i16> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v5i32_arg:
+; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
+define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind {
+  store <5 x i32> %in, ptr addrspace(1) null
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v6i32_arg:
+; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
+define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind {
+  store <6 x i32> %in, ptr addrspace(1) null
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}i65_arg:
 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0
 

>From 6f37d42a33ba09add14a9a1f422cb489ba02336e Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh at arm.com>
Date: Thu, 25 Jul 2024 11:06:46 +0100
Subject: [PATCH 116/259] [AArch64][GlobalISel] Reland Make G_DUP immediate
 32-bits or larger (#96780) (#99014)

The immediate operand is extended in RegBankSelect to at least 32 bits to
allow for better pattern matching in TableGen.

The previous patch erased a constant without checking whether it had
more than one use.

Changes:
 - Does not erase the constant.
 - Adds a @v_dup16_const test.
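
A minimal IR sketch of the regression the reland guards against (the function
name is illustrative; the patch adds the equivalent @v_dup16_const test in
arm64-dup.ll below): a splat of a small immediate whose constant also has a
scalar use, so the G_CONSTANT feeding the G_DUP may have more than one user
and must not be erased after it is widened to 32 bits.

    define <4 x i16> @dup16_const_sketch(i16 %y, ptr %p) {
      ; splat of the immediate 10; now selectable as movi.4h v0, #10
      %i  = insertelement <4 x i16> undef, i16 10, i32 0
      %lo = shufflevector <4 x i16> %i, <4 x i16> undef, <4 x i32> zeroinitializer
      ; the same constant is also stored as a scalar, so the original constant
      ; can still have a user besides the widened splat operand
      store i16 10, ptr %p
      ret <4 x i16> %lo
    }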
---
 .../GISel/AArch64InstructionSelector.cpp      |  8 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 35 ++++++++-
 .../AArch64/GlobalISel/regbank-dup.mir        | 75 +++++++++++--------
 .../CodeGen/AArch64/GlobalISel/select-dup.mir | 19 +++++
 llvm/test/CodeGen/AArch64/arm64-dup.ll        | 67 ++++++++++++-----
 llvm/test/CodeGen/AArch64/neon-mov.ll         | 73 ++++--------------
 6 files changed, 160 insertions(+), 117 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index d33b0ab7b9fcd..e9e6b6cb68d0d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2284,8 +2284,9 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
     Register Dst = I.getOperand(0).getReg();
     auto *CV = ConstantDataVector::getSplat(
         MRI.getType(Dst).getNumElements(),
-        ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
-                         ValAndVReg->Value));
+        ConstantInt::get(
+            Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()),
+            ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits())));
     if (!emitConstantVector(Dst, CV, MIB, MRI))
       return false;
     I.eraseFromParent();
@@ -5614,7 +5615,8 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
   }
 
   if (CV->getSplatValue()) {
-    APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger());
+    APInt DefBits = APInt::getSplat(
+        DstSize, CV->getUniqueInteger().trunc(DstTy.getScalarSizeInBits()));
     auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
       MachineInstr *NewOp;
       bool Inv = false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 5616d063f70bc..220ddbf10a1c2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -42,6 +42,7 @@
 #include "AArch64GenRegisterBankInfo.def"
 
 using namespace llvm;
+static const unsigned CustomMappingID = 1;
 
 AArch64RegisterBankInfo::AArch64RegisterBankInfo(
     const TargetRegisterInfo &TRI) {
@@ -424,6 +425,26 @@ void AArch64RegisterBankInfo::applyMappingImpl(
     MI.getOperand(2).setReg(Ext.getReg(0));
     return applyDefaultMapping(OpdMapper);
   }
+  case AArch64::G_DUP: {
+    // Extend smaller gpr to 32-bits
+    assert(MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() < 32 &&
+           "Expected sources smaller than 32-bits");
+    Builder.setInsertPt(*MI.getParent(), MI.getIterator());
+
+    Register ConstReg;
+    auto ConstMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+    if (ConstMI->getOpcode() == TargetOpcode::G_CONSTANT) {
+      auto CstVal = ConstMI->getOperand(1).getCImm()->getValue();
+      ConstReg =
+          Builder.buildConstant(LLT::scalar(32), CstVal.sext(32)).getReg(0);
+    } else {
+      ConstReg = Builder.buildAnyExt(LLT::scalar(32), MI.getOperand(1).getReg())
+                     .getReg(0);
+    }
+    MRI.setRegBank(ConstReg, getRegBank(AArch64::GPRRegBankID));
+    MI.getOperand(1).setReg(ConstReg);
+    return applyDefaultMapping(OpdMapper);
+  }
   default:
     llvm_unreachable("Don't know how to handle that operation");
   }
@@ -792,8 +813,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
              (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
               onlyDefinesFP(*ScalarDef, MRI, TRI)))
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
-    else
+    else {
+      if (ScalarTy.getSizeInBits() < 32 &&
+          getRegBank(ScalarReg, MRI, TRI) == &AArch64::GPRRegBank) {
+        // Calls applyMappingImpl()
+        MappingID = CustomMappingID;
+      }
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+    }
     break;
   }
   case TargetOpcode::G_TRUNC: {
@@ -1014,8 +1041,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       // If the type is i8/i16, and the regank will be GPR, then we change the
       // type to i32 in applyMappingImpl.
       LLT Ty = MRI.getType(MI.getOperand(2).getReg());
-      if (Ty.getSizeInBits() == 8 || Ty.getSizeInBits() == 16)
-        MappingID = 1;
+      if (Ty.getSizeInBits() == 8 || Ty.getSizeInBits() == 16) {
+        // Calls applyMappingImpl()
+        MappingID = CustomMappingID;
+      }
       OpRegBankIdx[2] = PMI_FirstGPR;
     }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
index 4cd6eef531ce0..66c8c2efda9bc 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-dup.mir
@@ -16,10 +16,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s32_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $w0
     %4:_(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
@@ -37,10 +38,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s64_gpr
     ; CHECK: liveins: $x0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $x0
     %4:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
@@ -58,10 +60,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s32_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
-    ; CHECK: RET_ReallyLR implicit $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(s32) = COPY $w0
     %4:_(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
@@ -79,10 +82,11 @@ body:             |
 
     ; CHECK-LABEL: name: v4s32_fpr
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $q0 = COPY [[DUP]](<4 x s32>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<4 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $s0
     %4:_(<4 x s32>) = G_DUP %0(s32)
     $q0 = COPY %4(<4 x s32>)
@@ -100,10 +104,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s64_fpr
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $d0
     %4:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %4(<2 x s64>)
@@ -121,10 +126,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s32_fpr
     ; CHECK: liveins: $s0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
-    ; CHECK: $d0 = COPY [[DUP]](<2 x s32>)
-    ; CHECK: RET_ReallyLR implicit $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s32) = COPY $s0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s32>) = G_DUP [[COPY]](s32)
+    ; CHECK-NEXT: $d0 = COPY [[DUP]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(s32) = COPY $s0
     %4:_(<2 x s32>) = G_DUP %0(s32)
     $d0 = COPY %4(<2 x s32>)
@@ -142,10 +148,11 @@ body:             |
 
     ; CHECK-LABEL: name: v2s64_fpr_copy
     ; CHECK: liveins: $d0
-    ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
-    ; CHECK: $q0 = COPY [[DUP]](<2 x s64>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(s64) = COPY $d0
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<2 x s64>) = G_DUP [[COPY]](s64)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s64) = COPY $d0
     %6:_(<2 x s64>) = G_DUP %0(s64)
     $q0 = COPY %6(<2 x s64>)
@@ -163,11 +170,13 @@ body:             |
 
     ; CHECK-LABEL: name: v416s8_gpr
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
-    ; CHECK: %trunc:gpr(s8) = G_TRUNC [[COPY]](s32)
-    ; CHECK: [[DUP:%[0-9]+]]:fpr(<16 x s8>) = G_DUP %trunc(s8)
-    ; CHECK: $q0 = COPY [[DUP]](<16 x s8>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+    ; CHECK-NEXT: %trunc:gpr(s8) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:gpr(s32) = G_ANYEXT %trunc(s8)
+    ; CHECK-NEXT: [[DUP:%[0-9]+]]:fpr(<16 x s8>) = G_DUP [[ANYEXT]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[DUP]](<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $w0
     %trunc:_(s8) = G_TRUNC %0(s32)
     %1:_(<16 x s8>) = G_DUP %trunc(s8)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
index 809bdceb4aa25..cf2bab78fe5a6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-dup.mir
@@ -453,3 +453,22 @@ body:             |
     %dup:fpr(<2 x p0>) = G_DUP %cst(p0)
     $q0 = COPY %dup(<2 x p0>)
     RET_ReallyLR implicit $q0
+...
+---
+name:            cstv4i16gpri32
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins:
+    ; CHECK-LABEL: name: cstv4i16gpri32
+    ; CHECK: %dup:fpr64 = MOVIv4i16 3, 0
+    ; CHECK-NEXT: $d0 = COPY %dup
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %cst:gpr(s32) = G_CONSTANT i32 3
+    %dup:fpr(<4 x s16>) = G_DUP %cst(s32)
+    $d0 = COPY %dup(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54830..0291f8c912304 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -103,6 +103,19 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
   ret <4 x i32> %tmp4
 }
 
+define <4 x i16> @v_dup16_const(i16 %y, ptr %p) {
+; CHECK-LABEL: v_dup16_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi.4h v0, #10
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    strh w8, [x1]
+; CHECK-NEXT:    ret
+    %i = insertelement <4 x i16> undef, i16 10, i32 0
+    %lo = shufflevector <4 x i16> %i, <4 x i16> undef, <4 x i32> zeroinitializer
+    store i16 10, ptr %p
+    ret <4 x i16> %lo
+}
+
 define <4 x float> @v_dupQfloat(float %A) nounwind {
 ; CHECK-LABEL: v_dupQfloat:
 ; CHECK:       // %bb.0:
@@ -420,9 +433,9 @@ define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) n
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -443,9 +456,9 @@ define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI35_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -462,9 +475,9 @@ define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) n
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -481,9 +494,9 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -503,12 +516,12 @@ define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: disguised_dup:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_1
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_1
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_0]
 ; CHECK-GI-NEXT:    tbl.16b v2, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    str q0, [x0]
 ; CHECK-GI-NEXT:    str q2, [x1]
@@ -531,8 +544,8 @@ define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
 ;
 ; CHECK-GI-LABEL: dup_const2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI39_0]
 ; CHECK-GI-NEXT:    add.2s v0, v0, v1
 ; CHECK-GI-NEXT:    ret
   %tmp2 = add <2 x i32> %A, <i32 8421378, i32 8421378>
@@ -550,8 +563,8 @@ define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
 ;
 ; CHECK-GI-LABEL: dup_const4_ext:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
 ; CHECK-GI-NEXT:    add.4s v0, v0, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -575,12 +588,12 @@ define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind
 ;
 ; CHECK-GI-LABEL: dup_const24:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI40_1
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_1
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    ldr d3, [x8, :lo12:.LCPI40_1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
+; CHECK-GI-NEXT:    ldr d3, [x8, :lo12:.LCPI41_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
 ; CHECK-GI-NEXT:    add.2s v0, v0, v3
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI40_0]
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI41_0]
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-GI-NEXT:    add.4s v1, v2, v3
 ; CHECK-GI-NEXT:    eor.16b v0, v1, v0
@@ -687,3 +700,17 @@ define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
   ret <8 x i16> %r
 }
 
+define <4 x i16> @dup_i16_v4i16_constant() {
+; CHECK-SD-LABEL: dup_i16_v4i16_constant:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #9211 // =0x23fb
+; CHECK-SD-NEXT:    dup.4h v0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: dup_i16_v4i16_constant:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI50_0
+; CHECK-GI-NEXT:    ldr d0, [x8, :lo12:.LCPI50_0]
+; CHECK-GI-NEXT:    ret
+    ret <4 x i16> <i16 9211, i16 9211, i16 9211, i16 9211>
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index 170ba7292ae60..ff80ff097b28f 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -109,29 +109,11 @@ define <4 x i32> @movi4s_lsl16() {
 }
 
 define <4 x i32> @movi4s_fneg() {
-; CHECK-NOFP16-SD-LABEL: movi4s_fneg:
-; CHECK-NOFP16-SD:       // %bb.0:
-; CHECK-NOFP16-SD-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-NOFP16-SD-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NOFP16-SD-NEXT:    ret
-;
-; CHECK-FP16-SD-LABEL: movi4s_fneg:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-FP16-SD-NEXT:    fneg v0.4s, v0.4s
-; CHECK-FP16-SD-NEXT:    ret
-;
-; CHECK-NOFP16-GI-LABEL: movi4s_fneg:
-; CHECK-NOFP16-GI:       // %bb.0:
-; CHECK-NOFP16-GI-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-NOFP16-GI-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: movi4s_fneg:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.4s, #240, lsl #8
-; CHECK-FP16-GI-NEXT:    fneg v0.4s, v0.4s
-; CHECK-FP16-GI-NEXT:    ret
+; CHECK-LABEL: movi4s_fneg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #240, lsl #8
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    ret
    ret <4 x i32> <i32 2147545088, i32 2147545088, i32 2147545088, i32 2147545088>
 }
 
@@ -308,23 +290,17 @@ define <8 x i16> @mvni8h_neg() {
 ; CHECK-NOFP16-SD-NEXT:    dup v0.8h, w8
 ; CHECK-NOFP16-SD-NEXT:    ret
 ;
-; CHECK-FP16-SD-LABEL: mvni8h_neg:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.8h, #240
-; CHECK-FP16-SD-NEXT:    fneg v0.8h, v0.8h
-; CHECK-FP16-SD-NEXT:    ret
+; CHECK-FP16-LABEL: mvni8h_neg:
+; CHECK-FP16:       // %bb.0:
+; CHECK-FP16-NEXT:    movi v0.8h, #240
+; CHECK-FP16-NEXT:    fneg v0.8h, v0.8h
+; CHECK-FP16-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: mvni8h_neg:
 ; CHECK-NOFP16-GI:       // %bb.0:
 ; CHECK-NOFP16-GI-NEXT:    adrp x8, .LCPI32_0
 ; CHECK-NOFP16-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI32_0]
 ; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: mvni8h_neg:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.8h, #240
-; CHECK-FP16-GI-NEXT:    fneg v0.8h, v0.8h
-; CHECK-FP16-GI-NEXT:    ret
    ret <8 x i16> <i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008>
 }
 
@@ -494,29 +470,11 @@ define <2 x double> @fmov2d() {
 }
 
 define <2 x double> @fmov2d_neg0() {
-; CHECK-NOFP16-SD-LABEL: fmov2d_neg0:
-; CHECK-NOFP16-SD:       // %bb.0:
-; CHECK-NOFP16-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOFP16-SD-NEXT:    fneg v0.2d, v0.2d
-; CHECK-NOFP16-SD-NEXT:    ret
-;
-; CHECK-FP16-SD-LABEL: fmov2d_neg0:
-; CHECK-FP16-SD:       // %bb.0:
-; CHECK-FP16-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-FP16-SD-NEXT:    fneg v0.2d, v0.2d
-; CHECK-FP16-SD-NEXT:    ret
-;
-; CHECK-NOFP16-GI-LABEL: fmov2d_neg0:
-; CHECK-NOFP16-GI:       // %bb.0:
-; CHECK-NOFP16-GI-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOFP16-GI-NEXT:    fneg v0.2d, v0.2d
-; CHECK-NOFP16-GI-NEXT:    ret
-;
-; CHECK-FP16-GI-LABEL: fmov2d_neg0:
-; CHECK-FP16-GI:       // %bb.0:
-; CHECK-FP16-GI-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-FP16-GI-NEXT:    fneg v0.2d, v0.2d
-; CHECK-FP16-GI-NEXT:    ret
+; CHECK-LABEL: fmov2d_neg0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fneg v0.2d, v0.2d
+; CHECK-NEXT:    ret
 	ret <2 x double> <double -0.0, double -0.0>
 }
 
@@ -581,5 +539,4 @@ define <2 x i32> @movi1d() {
   ret <2 x i32> %1
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-FP16: {{.*}}
 ; CHECK-NOFP16: {{.*}}

>From ba461f8c6278a7b2fd7695454c1f184c38897ecd Mon Sep 17 00:00:00 2001
From: Him188 <tguan at nvidia.com>
Date: Thu, 25 Jul 2024 11:07:31 +0100
Subject: [PATCH 117/259] [AArch64][GlobalISel] Legalize fp128 types as
 libcalls for G_FCMP (#98452)

- Generate a libcall for supported predicates.
- Generate unsupported predicates as combinations of supported predicates.
- Vectors are scalarized; however, some cases like `v3f128_fp128` still fail
  because we fail to legalize G_OR for these types.

GISel now generates the same code as SDAG; note, however, the difference
in the `one` case (see the sketch below).
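
A rough IR-level sketch of the scheme, using the soft-float comparison routines
referenced in the patch (the exact generated MIR differs): a predicate with a
direct libcall mapping, such as olt, becomes one call plus a signed compare
against zero; a predicate without one, such as `one`, is assembled from two
calls whose compares are inverted and then combined.

    ; fcmp olt fp128 %a, %b
    %lt  = call i32 @__lttf2(fp128 %a, fp128 %b)   ; < 0 iff ordered and %a < %b
    %olt = icmp slt i32 %lt, 0

    ; fcmp one fp128 %a, %b == ordered and unequal, built as !oeq && !uno
    %eq  = call i32 @__eqtf2(fp128 %a, fp128 %b)     ; 0 iff ordered and equal
    %uo  = call i32 @__unordtf2(fp128 %a, fp128 %b)  ; nonzero iff unordered
    %ne  = icmp ne i32 %eq, 0                        ; !oeq
    %ord = icmp eq i32 %uo, 0                        ; !uno, i.e. ordered
    %one = and i1 %ne, %ord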
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   3 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 154 ++++-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   6 +-
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |  70 +-
 llvm/test/CodeGen/AArch64/fcmp-fp128.ll       | 560 ++++++++++++++++
 llvm/test/CodeGen/AArch64/fcmp.ll             | 615 +++++++++++-------
 6 files changed, 1153 insertions(+), 255 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/fcmp-fp128.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index cf1ca1dc34024..0b2cd299bde12 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -280,6 +280,9 @@ class LegalizerHelper {
   LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                          MachineInstr &MI,
                                          LostDebugLocObserver &LocObserver);
+  LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver);
 
   MachineInstrBuilder
   getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index b490ab2fc7617..225ec19246231 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -735,8 +735,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
   if (MemType.isVector())
     return RTLIB::UNKNOWN_LIBCALL;
 
-#define LCALLS(A, B)                                                           \
-  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
 #define LCALL5(A)                                                              \
   LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
   switch (Opc) {
@@ -992,6 +991,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                        LocObserver, nullptr);
 }
 
+/// Returns the corresponding libcall for the given Pred and
+/// the ICMP predicate that should be generated to compare with #0
+/// after the libcall.
+static std::pair<RTLIB::Libcall, CmpInst::Predicate>
+getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
+
+  switch (Pred) {
+  case CmpInst::FCMP_OEQ:
+    return {RTLIB::OEQ_F128, CmpInst::ICMP_EQ};
+  case CmpInst::FCMP_UNE:
+    return {RTLIB::UNE_F128, CmpInst::ICMP_NE};
+  case CmpInst::FCMP_OGE:
+    return {RTLIB::OGE_F128, CmpInst::ICMP_SGE};
+  case CmpInst::FCMP_OLT:
+    return {RTLIB::OLT_F128, CmpInst::ICMP_SLT};
+  case CmpInst::FCMP_OLE:
+    return {RTLIB::OLE_F128, CmpInst::ICMP_SLE};
+  case CmpInst::FCMP_OGT:
+    return {RTLIB::OGT_F128, CmpInst::ICMP_SGT};
+  case CmpInst::FCMP_UNO:
+    return {RTLIB::UO_F128, CmpInst::ICMP_NE};
+  default:
+    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
+  }
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver) {
+  auto &MF = MIRBuilder.getMF();
+  auto &Ctx = MF.getFunction().getContext();
+  const GFCmp *Cmp = cast<GFCmp>(&MI);
+
+  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
+  if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
+    return UnableToLegalize;
+
+  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
+
+  // DstReg type is s32
+  const Register DstReg = Cmp->getReg(0);
+  const auto Cond = Cmp->getCond();
+
+  // Reference:
+  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
+  // Generates a libcall followed by ICMP.
+  const auto BuildLibcall =
+      [&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
+          const DstOp &Res = LLT::scalar(32)) -> Register {
+    // FCMP libcall always returns an i32, and needs an ICMP with #0.
+    constexpr LLT TempLLT = LLT::scalar(32);
+    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
+    // Generate libcall, holding result in Temp
+    const auto Status = createLibcall(
+        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
+        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
+        LocObserver, &MI);
+    if (!Status)
+      return {};
+
+    // Compare temp with #0 to get the final result.
+    return MIRBuilder
+        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
+        .getReg(0);
+  };
+
+  // Simple case if we have a direct mapping from predicate to libcall
+  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
+      Libcall != RTLIB::UNKNOWN_LIBCALL &&
+      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
+    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
+      return Legalized;
+    }
+    return UnableToLegalize;
+  }
+
+  // No direct mapping found, should be generated as combination of libcalls.
+
+  switch (Cond) {
+  case CmpInst::FCMP_UEQ: {
+    // FCMP_UEQ: unordered or equal
+    // Convert into (FCMP_OEQ || FCMP_UNO).
+
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
+    if (Oeq && Uno)
+      MIRBuilder.buildOr(DstReg, Oeq, Uno);
+    else
+      return UnableToLegalize;
+
+    break;
+  }
+  case CmpInst::FCMP_ONE: {
+    // FCMP_ONE: ordered and operands are unequal
+    // Convert into (!FCMP_OEQ && !FCMP_UNO).
+
+    // We inverse the predicate instead of generating a NOT
+    // to save one instruction.
+    // On AArch64 isel can even select two cmp into a single ccmp.
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto NotOeq =
+        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto NotUno =
+        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));
+
+    if (NotOeq && NotUno)
+      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
+    else
+      return UnableToLegalize;
+
+    break;
+  }
+  case CmpInst::FCMP_ULT:
+  case CmpInst::FCMP_UGE:
+  case CmpInst::FCMP_UGT:
+  case CmpInst::FCMP_ULE:
+  case CmpInst::FCMP_ORD: {
+    // Convert into: !(inverse(Pred))
+    // E.g. FCMP_ULT becomes !FCMP_OGE
+    // This is equivalent to the following, but saves some instructions.
+    //   MIRBuilder.buildNot(
+    //       PredTy,
+    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
+    //                            Op1, Op2));
+    const auto [InversedLibcall, InversedPred] =
+        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
+    if (!BuildLibcall(InversedLibcall,
+                      CmpInst::getInversePredicate(InversedPred), DstReg))
+      return UnableToLegalize;
+    break;
+  }
+  default:
+    return UnableToLegalize;
+  }
+
+  return Legalized;
+}
+
 // The function is used to legalize operations that set default environment
 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
 // On most targets supported in glibc FE_DFL_MODE is defined as
@@ -1138,6 +1281,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_FCMP: {
+    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
+    if (Status != Legalized)
+      return Status;
+    MI.eraseFromParent();
+    return Status;
+  }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {
     // FIXME: Support other types
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c1c51660ecf1c..d3c5742cee3eb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -561,7 +561,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       })
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
-      .clampScalarOrElt(1, MinFPScalar, s64)
+      .minScalarOrElt(1, MinFPScalar)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -573,7 +574,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(1, v4s16, v8s16)
       .clampNumElements(1, v2s32, v4s32)
       .clampMaxNumElements(1, s64, 2)
-      .moreElementsToNextPow2(1);
+      .moreElementsToNextPow2(1)
+      .libcallFor({{s32, s128}});
 
   // Extensions
   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index b6702bba1598c..50afc79a5a576 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
 target triple = "arm64-apple-ios"
 
 define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
@@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
 ; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.
 
 define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: f128_select_and_olt_oge:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT:    bl ___lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w21, lt
-; CHECK-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
-; CHECK-NEXT:    bl ___getf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w8, ge
-; CHECK-NEXT:    tst w8, w21
-; CHECK-NEXT:    csel w0, w20, w19, ne
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: f128_select_and_olt_oge:
+; SDISEL:       ; %bb.0:
+; SDISEL-NEXT:    sub sp, sp, #80
+; SDISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; SDISEL-NEXT:    mov x19, x1
+; SDISEL-NEXT:    mov x20, x0
+; SDISEL-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
+; SDISEL-NEXT:    bl ___lttf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w21, lt
+; SDISEL-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
+; SDISEL-NEXT:    bl ___getf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w8, ge
+; SDISEL-NEXT:    tst w8, w21
+; SDISEL-NEXT:    csel w0, w20, w19, ne
+; SDISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; SDISEL-NEXT:    add sp, sp, #80
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: f128_select_and_olt_oge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #80
+; GISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp q3, q2, [sp] ; 32-byte Folded Spill
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    mov x20, x1
+; GISEL-NEXT:    bl ___lttf2
+; GISEL-NEXT:    mov x21, x0
+; GISEL-NEXT:    ldp q1, q0, [sp] ; 32-byte Folded Reload
+; GISEL-NEXT:    bl ___getf2
+; GISEL-NEXT:    cmp w21, #0
+; GISEL-NEXT:    ccmp w0, #0, #8, lt
+; GISEL-NEXT:    csel w0, w19, w20, ge
+; GISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT:    add sp, sp, #80
+; GISEL-NEXT:    ret
   %c0 = fcmp olt fp128 %v0, %v1
   %c1 = fcmp oge fp128 %v2, %v3
   %cr = and i1 %c1, %c0
diff --git a/llvm/test/CodeGen/AArch64/fcmp-fp128.ll b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
new file mode 100644
index 0000000000000..503cb8c533bab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
@@ -0,0 +1,560 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-GI
+
+; Checks generated libcalls for fp128 types
+
+define double @oeq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: oeq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: oeq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp oeq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ogt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ogt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ogt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ogt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @olt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: olt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: olt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp olt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ole(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ole:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ole:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ole fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @one(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: one:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w19, ne
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w8, eq
+; CHECK-SD-NEXT:    tst w8, w19
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: one:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp one fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ord(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ord:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ord:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ord fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uno(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uno:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uno:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uno fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ueq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ueq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ueq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ueq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ugt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ugt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ugt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ugt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uge(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uge:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ge
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uge:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ge
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uge fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ult(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ult:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ult:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ult fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ule(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ule:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ule:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ule fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @une(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: une:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __netf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: une:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __netf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp une fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a76d0b36fa1aa..a5d7ae147ffda 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,37 +4,49 @@
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-
-; CHECK-GI:      warning: Instruction selection used fallback path for f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_float
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_half
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_fp128
+; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_double
-
 
 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
-; CHECK-LABEL: f128_fp128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    b.ge .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %entry
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:  .LBB0_2: // %entry
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_fp128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    b.ge .LBB0_2
+; CHECK-SD-NEXT:  // %bb.1: // %entry
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:  .LBB0_2: // %entry
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_fp128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov d0, v2.d[1]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fcsel d2, d2, d3, lt
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    fcsel d1, d0, d1, lt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, fp128 %d, fp128 %e
@@ -42,37 +54,61 @@ entry:
 }
 
 define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
-; CHECK-LABEL: f128_i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    mov x21, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel x20, x22, x20, lt
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    csel x1, x21, x19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov x21, x1
+; CHECK-SD-NEXT:    mov x22, x0
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel x20, x22, x20, lt
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    mov w8, w0
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w8, #0
+; CHECK-SD-NEXT:    csel x1, x21, x19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x21, x2
+; CHECK-GI-NEXT:    mov x22, x3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x0, x19, x21, lt
+; CHECK-GI-NEXT:    csel x1, x20, x22, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i128 %d, i128 %e
@@ -80,22 +116,39 @@ entry:
 }
 
 define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
-; CHECK-LABEL: f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov d8, d3
-; CHECK-NEXT:    fmov d9, d2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel d0, d9, d8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, double %d, double %e
@@ -103,22 +156,39 @@ entry:
 }
 
 define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
-; CHECK-LABEL: f128_float:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov s8, s3
-; CHECK-NEXT:    fmov s9, s2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_float:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov s8, s3
+; CHECK-SD-NEXT:    fmov s9, s2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel s0, s9, s8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_float:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel s0, s8, s9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, float %d, float %e
@@ -126,22 +196,39 @@ entry:
 }
 
 define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
-; CHECK-LABEL: f128_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov w19, w1
-; CHECK-NEXT:    mov w20, w0
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel w0, w20, w19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov w19, w1
+; CHECK-SD-NEXT:    mov w20, w0
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel w0, w20, w19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    mov w20, w1
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel w0, w19, w20, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i32 %d, i32 %e
@@ -184,40 +271,26 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
 ; CHECK-SD-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-NOFP16-LABEL: f128_half:
-; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-NOFP16-NEXT:    fmov s8, s3
-; CHECK-GI-NOFP16-NEXT:    fmov s9, s2
-; CHECK-GI-NOFP16-NEXT:    bl __lttf2
-; CHECK-GI-NOFP16-NEXT:    cmp w0, #0
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-GI-NOFP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ret
-;
-; CHECK-GI-FP16-LABEL: f128_half:
-; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-FP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-FP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-FP16-NEXT:    fmov s8, s3
-; CHECK-GI-FP16-NEXT:    fmov s9, s2
-; CHECK-GI-FP16-NEXT:    bl __lttf2
-; CHECK-GI-FP16-NEXT:    cmp w0, #0
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    fcsel h0, h9, h8, lt
-; CHECK-GI-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ret
+; CHECK-GI-LABEL: f128_half:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov w8, s8
+; CHECK-GI-NEXT:    fmov w9, s9
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    csel w8, w8, w9, lt
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, half %d, half %e
@@ -436,37 +509,64 @@ entry:
 
 
 define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> %d, <2 x double> %e) {
-; CHECK-LABEL: v2f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
-; CHECK-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
-; CHECK-NEXT:    add sp, sp, #96
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #96
+; CHECK-SD-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    add sp, sp, #96
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v2.16b
+; CHECK-GI-NEXT:    stp q5, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w19, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <2 x fp128> %a, %b
   %s = select <2 x i1> %c, <2 x double> %d, <2 x double> %e
@@ -474,61 +574,122 @@ entry:
 }
 
 define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> %d, <3 x double> %e) {
-; CHECK-LABEL: v3f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #160
-; CHECK-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 160
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-NEXT:    ldr d5, [sp, #184]
-; CHECK-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp d3, d2, [sp, #168]
-; CHECK-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr d5, [sp, #160]
-; CHECK-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    bsl v2.16b, v4.16b, v3.16b
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    add sp, sp, #160
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #160
+; CHECK-SD-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    ldr d5, [sp, #184]
+; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d5, [sp, #160]
+; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d1, x8
+; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    add sp, sp, #160
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #176
+; CHECK-GI-NEXT:    str x30, [sp, #128] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 176
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    stp q4, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    ldr x19, [sp, #176]
+; CHECK-GI-NEXT:    stp q5, q2, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #184]
+; CHECK-GI-NEXT:    ldr x20, [sp, #200]
+; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    str q7, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q2, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #192]
+; CHECK-GI-NEXT:    stp q6, q2, [sp, #80] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w21, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w22, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q0, q2, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x8, x21, #0, #1
+; CHECK-GI-NEXT:    ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x9, x22, #0, #1
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v2.d[1], v0.d[0]
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    and x9, x19, x8
+; CHECK-GI-NEXT:    bic x8, x20, x8
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    orr x8, x9, x8
+; CHECK-GI-NEXT:    bic v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    add sp, sp, #176
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x fp128> %a, %b
   %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e

From b72689a5cbd8645c183476cd87e32948308c5b64 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 25 Jul 2024 11:16:09 +0100
Subject: [PATCH 118/259] [LV] Ignore live-out users in cost model if scalar
 epilogue is required.

Follow-up to ba8126b6fef79.

If a scalar epilogue is required, users outside the loop won't use
live-outs from the vector loop but from the scalar epilogue, so the cost
model can ignore them in that case.

This fixes another case where the VPlan-based cost model computes the cost
more accurately.

Fixes https://github.com/llvm/llvm-project/issues/100464.
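
In outline, the patch threads a live-out check through the existing
"only used by already-ignored values" filter in collectValuesToIgnore().
Below is a minimal sketch of the idea, reusing the member names that appear
in the diff (TheLoop, VecValuesToIgnore, ValuesToIgnore) and assuming the
surrounding LoopVectorizationCostModel context; it is not a drop-in
replacement for the change itself:

  // When a scalar epilogue is guaranteed to run, any user outside the loop
  // is fed by the epilogue, so the vector loop's copy of the value is dead.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
  auto IsLiveOutDead = [&](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(cast<Instruction>(U)->getParent());
  };

  // An instruction remains a candidate for being treated as free only if
  // every user is already ignored or is such a dead live-out.
  bool OnlyIgnoredOrDeadUsers = all_of(I.users(), [&](User *U) {
    return VecValuesToIgnore.contains(U) || ValuesToIgnore.contains(U) ||
           IsLiveOutDead(U);
  });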
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 24 ++++-
 .../LoopVectorize/RISCV/dead-ops-cost.ll      | 97 +++++++++++++++++++
 .../RISCV/riscv-vector-reverse.ll             |  2 +
 3 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b709ccc3c35a7..224d98d5c4f99 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6678,6 +6678,15 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
 
   SmallVector<Value *, 4> DeadInterleavePointerOps;
   SmallVector<Value *, 4> DeadOps;
+
+  // If a scalar epilogue is required, users outside the loop won't use
+  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
+  // that is the case.
+  bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
+  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
+    return RequiresScalarEpilogue &&
+           !TheLoop->contains(cast<Instruction>(U)->getParent());
+  };
   for (BasicBlock *BB : TheLoop->blocks())
     for (Instruction &I : *BB) {
       // Find all stores to invariant variables. Since they are going to sink
@@ -6693,8 +6702,9 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
       // Add instructions that would be trivially dead and are only used by
       // values already ignored to DeadOps to seed worklist.
       if (wouldInstructionBeTriviallyDead(&I, TLI) &&
-          all_of(I.users(), [this](User *U) {
-            return VecValuesToIgnore.contains(U) || ValuesToIgnore.contains(U);
+          all_of(I.users(), [this, IsLiveOutDead](User *U) {
+            return VecValuesToIgnore.contains(U) ||
+                   ValuesToIgnore.contains(U) || IsLiveOutDead(U);
           }))
         DeadOps.push_back(&I);
 
@@ -6727,16 +6737,22 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
 
   // Mark ops that would be trivially dead and are only used by ignored
   // instructions as free.
+  BasicBlock *Header = TheLoop->getHeader();
   for (unsigned I = 0; I != DeadOps.size(); ++I) {
     auto *Op = dyn_cast<Instruction>(DeadOps[I]);
     // Skip any op that shouldn't be considered dead.
     if (!Op || !TheLoop->contains(Op) ||
+        (isa<PHINode>(Op) && Op->getParent() == Header) ||
         !wouldInstructionBeTriviallyDead(Op, TLI) ||
-        any_of(Op->users(), [this](User *U) {
-          return !VecValuesToIgnore.contains(U) && !ValuesToIgnore.contains(U);
+        any_of(Op->users(), [this, IsLiveOutDead](User *U) {
+          return !VecValuesToIgnore.contains(U) && !ValuesToIgnore.contains(U) &&
+                 !IsLiveOutDead(U);
         }))
       continue;
 
+    if (!TheLoop->contains(Op->getParent()))
+      continue;
+
     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
     // which applies for both scalar and vector versions. Otherwise it is only
     // dead in vector versions, so only add it to VecValuesToIgnore.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 9db4108de4b0a..da5db810ec638 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -86,9 +86,106 @@ loop:
 exit:
   ret void
 }
+
+; Test case for https://github.com/llvm/llvm-project/issues/100464.
+; Loop with a live-out %l and scalar epilogue required due to an interleave
+; group. As the scalar epilogue is required the live-out is fed from the scalar
+; epilogue and dead in the vector loop.
+define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
+; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 4, [[TMP13]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %idxprom = sext i32 %iv to i64
+  %gep.src = getelementptr i8, ptr %src, i64 %idxprom
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %idxprom
+  store i8 0, ptr %gep.dst, align 1
+  %iv.next = add i32 %iv, 4
+  %cmp = icmp ult i32 %iv, 1001
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  %r = phi i8 [ %l, %loop ]
+  ret i8 %r
+}
+
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index fc310f4163082..3a14842580425 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -18,6 +18,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an induction variable.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
@@ -222,6 +223,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found FP op with unsafe algebra.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.

>From 91073380ac5a0dceebdd09f360a1dc194d7ee93f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Thu, 25 Jul 2024 12:25:19 +0200
Subject: [PATCH 119/259] [BasicAA] Fix handling of indirect assumption based
 results (#100130)

If a result is potentially based on a not yet proven assumption,
BasicAA will remember it inside AssumptionBasedResults and remove
the cache entry if an assumption higher up is later disproved.
However, we currently miss the case where another cache entry ends
up depending on such an AssumptionBased result.

Fix this by introducing an additional AssumptionBased state for
cache entries. If such a result is used, we'll still increment
AAQI.NumAssumptionUses, which means that the using entry will
also become AssumptionBased and be cleared if the assumption is
disproved.

At the end of the root query, convert remaining AssumptionBased
results into definitive results.

Fixes https://github.com/llvm/llvm-project/issues/98978.
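
As a quick aid to reading the diff below, here is a minimal, self-contained
C++ sketch of the new cache-entry encoding. The sentinel values and the two
predicates mirror the patch; the struct name, main() and the transition order
shown are illustrative assumptions only, not the actual BasicAA code.

  #include <cassert>

  struct CacheEntrySketch {
    static constexpr int Definitive = -2;      // proven result, never purged
    static constexpr int AssumptionBased = -1; // relies on an unproven assumption
    int NumAssumptionUses = 0; // >= 0: this entry itself is an assumption

    bool isDefinitive() const { return NumAssumptionUses == Definitive; }
    bool isAssumption() const { return NumAssumptionUses >= 0; }
  };

  int main() {
    CacheEntrySketch E;                 // fresh entry: an unproven assumption
    assert(E.isAssumption() && !E.isDefinitive());
    E.NumAssumptionUses = CacheEntrySketch::AssumptionBased;
    assert(!E.isAssumption() && !E.isDefinitive()); // depends on another assumption
    E.NumAssumptionUses = CacheEntrySketch::Definitive; // root query completed
    assert(E.isDefinitive());
    return 0;
  }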
---
 llvm/include/llvm/Analysis/AliasAnalysis.h    |  17 ++-
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      |  28 ++++-
 .../Transforms/SLPVectorizer/X86/pr98978.ll   | 106 ++++++++++++++++++
 3 files changed, 144 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll

diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 812b5a9f72a3a..4140387a1f341 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -244,12 +244,23 @@ class AAQueryInfo {
 public:
   using LocPair = std::pair<AACacheLoc, AACacheLoc>;
   struct CacheEntry {
+    /// Cache entry is neither an assumption nor does it use a (non-definitive)
+    /// assumption.
+    static constexpr int Definitive = -2;
+    /// Cache entry is not an assumption itself, but may be using an assumption
+    /// from higher up the stack.
+    static constexpr int AssumptionBased = -1;
+
     AliasResult Result;
-    /// Number of times a NoAlias assumption has been used.
-    /// 0 for assumptions that have not been used, -1 for definitive results.
+    /// Number of times a NoAlias assumption has been used, 0 for assumptions
+    /// that have not been used. Can also take one of the Definitive or
+    /// AssumptionBased values documented above.
     int NumAssumptionUses;
+
     /// Whether this is a definitive (non-assumption) result.
-    bool isDefinitive() const { return NumAssumptionUses < 0; }
+    bool isDefinitive() const { return NumAssumptionUses == Definitive; }
+    /// Whether this is an assumption that has not been proven yet.
+    bool isAssumption() const { return NumAssumptionUses >= 0; }
   };
 
   // Alias analysis result aggregration using which this query is performed.
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 161a3034e4829..e474899fb548e 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1692,9 +1692,12 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   if (!Pair.second) {
     auto &Entry = Pair.first->second;
     if (!Entry.isDefinitive()) {
-      // Remember that we used an assumption.
-      ++Entry.NumAssumptionUses;
+      // Remember that we used an assumption. This may either be a direct use
+      // of an assumption, or a use of an entry that may itself be based on an
+      // assumption.
       ++AAQI.NumAssumptionUses;
+      if (Entry.isAssumption())
+        ++Entry.NumAssumptionUses;
     }
     // Cache contains sorted {V1,V2} pairs but we should return original order.
     auto Result = Entry.Result;
@@ -1722,7 +1725,6 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   Entry.Result = Result;
   // Cache contains sorted {V1,V2} pairs.
   Entry.Result.swap(Swapped);
-  Entry.NumAssumptionUses = -1;
 
   // If the assumption has been disproven, remove any results that may have
   // been based on this assumption. Do this after the Entry updates above to
@@ -1734,8 +1736,26 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
   // The result may still be based on assumptions higher up in the chain.
   // Remember it, so it can be purged from the cache later.
   if (OrigNumAssumptionUses != AAQI.NumAssumptionUses &&
-      Result != AliasResult::MayAlias)
+      Result != AliasResult::MayAlias) {
     AAQI.AssumptionBasedResults.push_back(Locs);
+    Entry.NumAssumptionUses = AAQueryInfo::CacheEntry::AssumptionBased;
+  } else {
+    Entry.NumAssumptionUses = AAQueryInfo::CacheEntry::Definitive;
+  }
+
+  // Depth is incremented before this function is called, so Depth==1 indicates
+  // a root query.
+  if (AAQI.Depth == 1) {
+    // Any remaining assumption based results must be based on proven
+    // assumptions, so convert them to definitive results.
+    for (const auto &Loc : AAQI.AssumptionBasedResults) {
+      auto It = AAQI.AliasCache.find(Loc);
+      if (It != AAQI.AliasCache.end())
+        It->second.NumAssumptionUses = AAQueryInfo::CacheEntry::Definitive;
+    }
+    AAQI.AssumptionBasedResults.clear();
+    AAQI.NumAssumptionUses = 0;
+  }
   return Result;
 }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll
new file mode 100644
index 0000000000000..429bf13b2b87a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr98978.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer < %s | FileCheck %s
+
+target triple = "x86_64-redhat-linux-gnu"
+
+; The load+store sequence inside bb10 should not get vectorized. Previously,
+; we incorrectly determined that the pointers do not alias, because a cache
+; entry based indirectly on a disproven NoAlias assumption was not cleared
+; from the BatchAA cache.
+define void @test(ptr %p1, i64 %arg1, i64 %arg2) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P1:%.*]], i64 [[ARG1:%.*]], i64 [[ARG2:%.*]]) {
+; CHECK-NEXT:  [[_PREHEADER48_PREHEADER_1:.*]]:
+; CHECK-NEXT:    br label %[[_LOOPEXIT49_1:.*]]
+; CHECK:       [[_LOOPEXIT49_1]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr [ [[I21:%.*]], %[[BB20:.*]] ], [ [[P1]], %[[_PREHEADER48_PREHEADER_1]] ]
+; CHECK-NEXT:    br i1 false, label %[[BB22:.*]], label %[[DOTPREHEADER48_PREHEADER_1:.*]]
+; CHECK:       [[DEAD:.*]]:
+; CHECK-NEXT:    br label %[[DOTPREHEADER48_PREHEADER_1]]
+; CHECK:       [[_PREHEADER48_PREHEADER_2:.*:]]
+; CHECK-NEXT:    [[I5:%.*]] = phi ptr [ [[I]], %[[DEAD]] ], [ [[I]], %[[_LOOPEXIT49_1]] ]
+; CHECK-NEXT:    br label %[[DOTLOOPEXIT49_1:.*]]
+; CHECK:       [[DEAD1:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[DOTLOOPEXIT49_1]], label %[[BB20]]
+; CHECK:       [[_LOOPEXIT49_2:.*:]]
+; CHECK-NEXT:    [[I6:%.*]] = phi ptr [ [[I5]], %[[DEAD1]] ], [ [[I5]], %[[DOTPREHEADER48_PREHEADER_1]] ]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[ARG1]]
+; CHECK-NEXT:    br label %[[BB10:.*]]
+; CHECK:       [[DEAD2:.*]]:
+; CHECK-NEXT:    br label %[[BB10]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[I11:%.*]] = phi ptr [ [[I7]], %[[DOTLOOPEXIT49_1]] ], [ null, %[[DEAD2]] ]
+; CHECK-NEXT:    [[I16:%.*]] = getelementptr i8, ptr [[I11]], i64 8
+; CHECK-NEXT:    [[I17:%.*]] = load i64, ptr [[I16]], align 1
+; CHECK-NEXT:    store i64 [[I17]], ptr [[I6]], align 1
+; CHECK-NEXT:    [[I18:%.*]] = getelementptr i8, ptr [[I6]], i64 8
+; CHECK-NEXT:    [[I19:%.*]] = load i64, ptr [[I11]], align 1
+; CHECK-NEXT:    store i64 [[I19]], ptr [[I18]], align 1
+; CHECK-NEXT:    br label %[[BB20]]
+; CHECK:       [[BB20]]:
+; CHECK-NEXT:    [[I21]] = phi ptr [ [[I5]], %[[DEAD1]] ], [ [[I6]], %[[BB10]] ]
+; CHECK-NEXT:    br label %[[_LOOPEXIT49_1]]
+; CHECK:       [[BB22]]:
+; CHECK-NEXT:    [[I23:%.*]] = getelementptr i8, ptr [[I]], i64 [[ARG2]]
+; CHECK-NEXT:    [[I25:%.*]] = getelementptr i8, ptr [[I23]], i64 8
+; CHECK-NEXT:    br label %[[BB26:.*]]
+; CHECK:       [[BB26]]:
+; CHECK-NEXT:    [[I27:%.*]] = phi ptr [ null, %[[BB26]] ], [ [[I25]], %[[BB22]] ]
+; CHECK-NEXT:    store i64 0, ptr [[I27]], align 1
+; CHECK-NEXT:    [[I28:%.*]] = getelementptr i8, ptr [[I27]], i64 8
+; CHECK-NEXT:    [[I29:%.*]] = load i64, ptr [[I23]], align 1
+; CHECK-NEXT:    store i64 0, ptr [[I28]], align 1
+; CHECK-NEXT:    br label %[[BB26]]
+;
+entry:
+  br label %loop1
+
+loop1:                                            ; preds = %bb20, %entry
+  %i = phi ptr [ %i21, %bb20 ], [ %p1, %entry ]
+  br i1 false, label %bb22, label %.preheader48.preheader.1
+
+dead:                                             ; No predecessors!
+  br label %.preheader48.preheader.1
+
+.preheader48.preheader.1:                         ; preds = %dead, %loop1
+  %i5 = phi ptr [ %i, %dead ], [ %i, %loop1 ]
+  br label %.loopexit49.1
+
+dead1:                                    ; No predecessors!
+  br i1 false, label %.loopexit49.1, label %bb20
+
+.loopexit49.1:                                    ; preds = %dead1, %.preheader48.preheader.1
+  %i6 = phi ptr [ %i5, %dead1 ], [ %i5, %.preheader48.preheader.1 ]
+  %i7 = getelementptr i8, ptr %i6, i64 %arg1
+  br label %bb10
+
+dead2:                                            ; No predecessors!
+  br label %bb10
+
+bb10:                                             ; preds = %dead2, %.loopexit49.1
+  %i11 = phi ptr [ %i7, %.loopexit49.1 ], [ null, %dead2 ]
+  %i16 = getelementptr i8, ptr %i11, i64 8
+  %i17 = load i64, ptr %i16, align 1
+  store i64 %i17, ptr %i6, align 1
+  %i18 = getelementptr i8, ptr %i6, i64 8
+  %i19 = load i64, ptr %i11, align 1
+  store i64 %i19, ptr %i18, align 1
+  br label %bb20
+
+bb20:                                             ; preds = %bb10, %dead1
+  %i21 = phi ptr [ %i5, %dead1 ], [ %i6, %bb10 ]
+  br label %loop1
+
+bb22:                                             ; preds = %loop1
+  %i23 = getelementptr i8, ptr %i, i64 %arg2
+  %i25 = getelementptr i8, ptr %i23, i64 8
+  br label %bb26
+
+bb26:                                             ; preds = %bb26, %bb22
+  %i27 = phi ptr [ null, %bb26 ], [ %i25, %bb22 ]
+  store i64 0, ptr %i27, align 1
+  %i28 = getelementptr i8, ptr %i27, i64 8
+  %i29 = load i64, ptr %i23, align 1
+  store i64 0, ptr %i28, align 1
+  br label %bb26
+}

>From ca69444cef0858ad4facecbfc2232a02422aca9f Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Thu, 25 Jul 2024 16:33:05 +0530
Subject: [PATCH 120/259] [Clang] Fix a variable shadowing in MapLattice (NFC)
 (#95697)

Reported in https://pvs-studio.com/en/blog/posts/cpp/1126/, fragment
N10.

The PVS-Studio warning:
V570 The 'C' variable is assigned to itself. MapLattice.h:52
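
A small, hedged C++ sketch of the pitfall and the fix; the Lattice type, the
member names and main() below are invented for illustration, and only the
constructor pattern mirrors the MapLattice change.

  #include <cassert>
  #include <map>
  #include <utility>

  struct Lattice {
    std::map<int, int> C;

    // Buggy pattern: the parameter 'C' shadows the member 'C', so the
    // assignment targets the parameter and the member stays empty.
    void initBuggy(std::map<int, int> C) { C = std::move(C); }

    // Fixed pattern: a member-initializer list (or 'this->C = ...')
    // unambiguously initializes the member.
    explicit Lattice(std::map<int, int> C) : C(std::move(C)) {}
  };

  int main() {
    Lattice L({{1, 2}});
    assert(L.C.size() == 1);        // the member was really initialized
    L.initBuggy({{3, 4}, {5, 6}});
    assert(L.C.size() == 1);        // unchanged: the buggy version never touched it
    return 0;
  }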
---
 clang/include/clang/Analysis/FlowSensitive/MapLattice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Analysis/FlowSensitive/MapLattice.h b/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
index 16b0c978779a7..b2d147e4ae444 100644
--- a/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/MapLattice.h
@@ -49,7 +49,7 @@ template <typename Key, typename ElementLattice> class MapLattice {
 
   MapLattice() = default;
 
-  explicit MapLattice(Container C) { C = std::move(C); }
+  explicit MapLattice(Container C) : C{std::move(C)} {};
 
   // The `bottom` element is the empty map.
   static MapLattice bottom() { return MapLattice(); }

>From 90a997988279ecd0e9aefcf14ee0642093b03398 Mon Sep 17 00:00:00 2001
From: Kai Yan <aklkaiyan at tencent.com>
Date: Thu, 25 Jul 2024 19:16:23 +0800
Subject: [PATCH 121/259] [llvm][CodeGen] Fixed a bug in stall cycle
 calculation for window scheduler (#99451)

Fixed a bug in the stall cycle calculation.
When a register defined by an instruction in the current iteration is
used by an instruction in the next iteration, the number of stall
cycles that need to be inserted is now computed correctly.
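
For intuition, a tiny self-contained sketch of the corrected arithmetic. Only
the DefCycle + Latency - II - UseCycle relation and the CurrentII = MaxCycle + 1
definition come from the patch; the concrete cycle numbers are made up for
illustration.

  #include <algorithm>
  #include <cstdio>

  int main() {
    int MaxCycle = 3;               // last cycle occupied in the current window
    int CurrentII = MaxCycle + 1;   // the initiation interval spans MaxCycle + 1 cycles
    int DefCycle = 3, UseCycle = 0; // def late in this iteration, use early in the next
    int Latency = 2;

    // Clamped at zero, like the running maximum in calculateStallCycle().
    int OldStall = std::max(0, DefCycle + Latency - MaxCycle - UseCycle);  // 2: one too many
    int NewStall = std::max(0, DefCycle + Latency - CurrentII - UseCycle); // 1
    std::printf("old=%d new=%d\n", OldStall, NewStall);
    return 0;
  }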
---
 llvm/lib/CodeGen/WindowScheduler.cpp          |  7 ++-
 .../CodeGen/Hexagon/swp-ws-stall-cycle.mir    | 59 +++++++++++++++++++
 2 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir

diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index cb878f7425481..6a2c6537470db 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -492,6 +492,7 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
 // ========================================
 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
   int MaxStallCycle = 0;
+  int CurrentII = MaxCycle + 1;
   auto Range = getScheduleRange(Offset, SchedInstrNum);
   for (auto &MI : Range) {
     auto *SU = TripleDAG->getSUnit(&MI);
@@ -499,8 +500,8 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
     for (auto &Succ : SU->Succs) {
       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
         continue;
-      // If the expected cycle does not exceed MaxCycle, no check is needed.
-      if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+      // If the expected cycle does not exceed CurrentII, no check is needed.
+      if (DefCycle + (int)Succ.getLatency() <= CurrentII)
         continue;
       // If the cycle of the scheduled MI A is less than that of the scheduled
       // MI B, the scheduling will fail because the lifetime of the
@@ -510,7 +511,7 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
       if (DefCycle < UseCycle)
         return WindowIILimit;
       // Get the stall cycle introduced by the register between two trips.
-      int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+      int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
     }
   }
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
new file mode 100644
index 0000000000000..ddba67d78eb58
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
@@ -0,0 +1,59 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs \
+# RUN: -window-region-limit=1 -window-search-ratio=100 -window-diff-limit=0 \
+# RUN: 2>&1 | FileCheck %s
+
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+
+---
+name:            test_window_stall_cycle
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $r0, $r1
+  
+    %0:intregs = COPY $r1
+    %1:intregs = COPY $r0
+    %2:intregs = nsw A2_add %0, %1
+    %3:intregs = S2_lsr_i_r_acc %2, %2, 31
+    %4:intregs = S2_asr_i_r killed %3, 1
+    %5:predregs = C2_cmpgt %1, %4
+    %6:intregs = A2_tfrsi 0
+    J2_jumpt killed %5, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1:
+    successors: %bb.2(0x80000000)
+  
+    %7:intregs = A2_addi %4, 2
+    %8:intregs = A2_tfrsi 0
+    %9:intregs = A2_sub %4, %1
+    %10:intregs = A2_addi %9, 1
+    %11:intregs = COPY %10
+    J2_loop0r %bb.2, %11, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2 (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %12:intregs = PHI %7, %bb.1, %13, %bb.2
+    %14:intregs = PHI %8, %bb.1, %15, %bb.2
+    %16:intregs = PHI %8, %bb.1, %17, %bb.2
+    %18:intregs, %13:intregs = L2_loadri_pi %12, -4
+    %17:intregs = nsw A2_add killed %18, %16
+    %15:intregs = A2_max %17, %14
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3:
+    %19:intregs = PHI %6, %bb.0, %15, %bb.2
+    $r0 = COPY %19
+    PS_jmpret $r31, implicit-def dead $pc, implicit $r0
+
+...

>From 72532c921989990180b11d46e38a92874008f51a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 25 Jul 2024 12:21:09 +0100
Subject: [PATCH 122/259] [LV] Don't predicate divs with invariant divisor when
 folding tail (#98904)

When folding the tail, at least one of the lanes must execute
unconditionally. If the divisor is loop-invariant, no predication is
needed, as predication would not prevent the divide-by-0 on the
unconditionally executed lane.

Depends on https://github.com/llvm/llvm-project/pull/98892.

PR: https://github.com/llvm/llvm-project/pull/98904
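
As a rough intuition for the change, a plain C++ sketch (not LLVM code; the
function and names are invented): with tail folding the first lane is always
active, so masking a division whose divisor is loop-invariant cannot avoid a
divide-by-zero that the original scalar loop would have hit anyway, and the
divide can be emitted unpredicated.

  #include <cstdio>

  // The divisor is the same on every iteration (loop-invariant). If it were
  // zero, the first iteration would already fault, masked or not, so a
  // tail-folded vector version gains nothing from predicating the division.
  void div_invariant(int *dst, const int *src, int n, int divisor) {
    for (int i = 0; i < n; ++i)
      dst[i] = src[i] / divisor;
  }

  int main() {
    int src[5] = {10, 20, 30, 40, 50}, dst[5];
    div_invariant(dst, src, 5, 2);
    std::printf("%d %d\n", dst[0], dst[4]);
    return 0;
  }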
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  63 ++++----
 .../AArch64/divs-with-scalable-vfs.ll         |  56 +++----
 .../AArch64/reduction-recurrence-costs-sve.ll |  35 ++---
 .../X86/divs-with-tail-folding.ll             | 147 +-----------------
 ...-order-recurrence-sink-replicate-region.ll |  47 ++----
 5 files changed, 95 insertions(+), 253 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 224d98d5c4f99..cab892146306c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3339,45 +3339,54 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
   }
 }
 
+// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
-  if (!blockNeedsPredicationForAnyReason(I->getParent()))
+  // If predication is not needed, avoid it.
+  // TODO: We can use the loop-preheader as context point here and get
+  // context sensitive reasoning for isSafeToSpeculativelyExecute.
+  if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
+      isSafeToSpeculativelyExecute(I) ||
+      (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
+      isa<BranchInst, PHINode>(I))
     return false;
 
-  // Can we prove this instruction is safe to unconditionally execute?
-  // If not, we must use some form of predication.
+  // If the instruction was executed conditionally in the original scalar loop,
+  // predication is needed with a mask whose lanes are all possibly inactive.
+  if (Legal->blockNeedsPredication(I->getParent()))
+    return true;
+
+  // All that remain are instructions with side-effects originally executed in
+  // the loop unconditionally, but now execute under a tail-fold mask (only)
+  // having at least one active lane (the first). If the side-effects of the
+  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
+  // - it will cause the same side-effects as when masked.
   switch(I->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable(
+        "instruction should have been considered by earlier checks");
+  case Instruction::Call:
+    // Side-effects of a Call are assumed to be non-invariant, needing a
+    // (fold-tail) mask.
+    assert(Legal->isMaskRequired(I) &&
+           "should have returned earlier for calls not needing a mask");
+    return true;
   case Instruction::Load:
+    // If the address is loop invariant no predication is needed.
+    return !Legal->isInvariant(getLoadStorePointerOperand(I));
   case Instruction::Store: {
-    if (!Legal->isMaskRequired(I))
-      return false;
-    // When we know the load's address is loop invariant and the instruction
-    // in the original scalar loop was unconditionally executed then we
-    // don't need to mark it as a predicated instruction. Tail folding may
-    // introduce additional predication, but we're guaranteed to always have
-    // at least one active lane.  We call Legal->blockNeedsPredication here
-    // because it doesn't query tail-folding.  For stores, we need to prove
-    // both speculation safety (which follows from the same argument as loads),
-    // but also must prove the value being stored is correct.  The easiest
-    // form of the later is to require that all values stored are the same.
-    if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
-        (isa<LoadInst>(I) ||
-         (isa<StoreInst>(I) &&
-          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
-        !Legal->blockNeedsPredication(I->getParent()))
-      return false;
-    return true;
+    // For stores, we need to prove both speculation safety (which follows from
+    // the same argument as loads), but also must prove the value being stored
+    // is correct.  The easiest form of the latter is to require that all values
+    // stored are the same.
+    return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
+             TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    // TODO: We can use the loop-preheader as context point here and get
-    // context sensitive reasoning
-    return !isSafeToSpeculativelyExecute(I);
-  case Instruction::Call:
-    return Legal->isMaskRequired(I);
+    // If the divisor is loop-invariant no predication is needed.
+    return !TheLoop->isLoopInvariant(I->getOperand(1));
   }
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index a6c07aaaabc64..bce2d6c14d866 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -274,50 +274,38 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP19]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_1_I]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP22:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP24:%.*]] = urem <vscale x 2 x i64> [[VEC_IND]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP26:%.*]] = udiv <vscale x 2 x i64> [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP28:%.*]] = urem <vscale x 2 x i64> [[TMP24]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP30:%.*]] = udiv <vscale x 2 x i64> [[TMP28]], [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP32:%.*]] = urem <vscale x 2 x i64> [[TMP28]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP22]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[X]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 2 x i64> [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP34]], [[TMP35]]
-; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], [[X]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <vscale x 2 x i64> [[TMP30]], i32 0
-; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP37]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP39]], [[X]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <vscale x 2 x i64> [[TMP32]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP40]], [[TMP41]]
-; CHECK-NEXT:    [[TMP43:%.*]] = shl i64 [[TMP42]], 32
-; CHECK-NEXT:    [[TMP44:%.*]] = ashr i64 [[TMP43]], 32
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP44]]
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr i64, ptr [[TMP45]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP22]], ptr [[TMP46]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
+; CHECK-NEXT:    [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT:    [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
+; CHECK-NEXT:    [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
+; CHECK-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[TMP27]]
+; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], [[X]]
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP34]], [[TMP28]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shl i64 [[TMP35]], 32
+; CHECK-NEXT:    [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-NEXT:    [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 2b2c224dbefe0..95588d1176dcd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -380,41 +380,40 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; PRED-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP2]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; PRED-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PRED:       vector.body:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
 ; PRED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
 ; PRED-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
 ; PRED-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
-; PRED-NEXT:    [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
-; PRED-NEXT:    [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
-; PRED-NEXT:    [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
+; PRED-NEXT:    [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
+; PRED-NEXT:    [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
-; PRED-NEXT:    [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; PRED-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
-; PRED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
+; PRED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED:       middle.block:
-; PRED-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
+; PRED-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
 ; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
index 133510fbb2db8..32964d650ea0f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
@@ -23,42 +23,15 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE8:.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
-; CHECK:       [[PRED_SDIV_IF]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
-; CHECK:       [[PRED_SDIV_CONTINUE]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_SDIV_IF]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
-; CHECK:       [[PRED_SDIV_IF3]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE4]]
-; CHECK:       [[PRED_SDIV_CONTINUE4]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE]] ], [ [[TMP11]], %[[PRED_SDIV_IF3]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]]
-; CHECK:       [[PRED_SDIV_IF5]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE6]]
-; CHECK:       [[PRED_SDIV_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP14]], %[[PRED_SDIV_IF5]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_SDIV_IF7:.*]], label %[[PRED_SDIV_CONTINUE8]]
-; CHECK:       [[PRED_SDIV_IF7]]:
 ; CHECK-NEXT:    [[TMP17:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE8]]
-; CHECK:       [[PRED_SDIV_CONTINUE8]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i64 [ poison, %[[PRED_SDIV_CONTINUE6]] ], [ [[TMP17]], %[[PRED_SDIV_IF7]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP9]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP9]], [[CONV61]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP17]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP17]], [[CONV61]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[TMP5]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32
 ; CHECK-NEXT:    [[TMP23:%.*]] = mul i32 [[X]], [[TMP19]]
@@ -254,117 +227,9 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[MUL_1_I:%.*]] = mul i64 [[X]], [[X]]
 ; CHECK-NEXT:    [[MUL_2_I:%.*]] = mul i64 [[MUL_1_I]], [[X]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[N]], 4294967295
-; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_UREM_CONTINUE6]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
-; CHECK:       [[PRED_UREM_IF]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv i64 [[TMP7]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = urem i64 [[TMP7]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP12:%.*]] = urem i64 [[TMP10]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP13:%.*]] = udiv i64 [[TMP12]], [[X]]
-; CHECK-NEXT:    [[TMP14:%.*]] = urem i64 [[TMP12]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
-; CHECK:       [[PRED_UREM_CONTINUE]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP11]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP21]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
-; CHECK:       [[PRED_UREM_IF1]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP23:%.*]] = udiv i64 [[TMP22]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP23]], i32 1
-; CHECK-NEXT:    [[TMP25:%.*]] = urem i64 [[TMP22]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP26:%.*]] = udiv i64 [[TMP25]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP27:%.*]] = urem i64 [[TMP25]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP28:%.*]] = udiv i64 [[TMP27]], [[X]]
-; CHECK-NEXT:    [[TMP29:%.*]] = urem i64 [[TMP27]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE2]]
-; CHECK:       [[PRED_UREM_CONTINUE2]]:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_UREM_CONTINUE]] ], [ [[TMP24]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP32:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP26]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP34:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP28]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP35:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE]] ], [ [[TMP29]], %[[PRED_UREM_IF1]] ]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP36]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]]
-; CHECK:       [[PRED_UREM_IF3]]:
-; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP38:%.*]] = udiv i64 [[TMP37]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i64> [[TMP30]], i64 [[TMP38]], i32 2
-; CHECK-NEXT:    [[TMP40:%.*]] = urem i64 [[TMP37]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP41:%.*]] = udiv i64 [[TMP40]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP42:%.*]] = urem i64 [[TMP40]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP43:%.*]] = udiv i64 [[TMP42]], [[X]]
-; CHECK-NEXT:    [[TMP44:%.*]] = urem i64 [[TMP42]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE4]]
-; CHECK:       [[PRED_UREM_CONTINUE4]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = phi <4 x i64> [ [[TMP30]], %[[PRED_UREM_CONTINUE2]] ], [ [[TMP39]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP41]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP49:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP43]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP50:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE2]] ], [ [[TMP44]], %[[PRED_UREM_IF3]] ]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP51]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]]
-; CHECK:       [[PRED_UREM_IF5]]:
-; CHECK-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP53:%.*]] = udiv i64 [[TMP52]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i64> [[TMP45]], i64 [[TMP53]], i32 3
-; CHECK-NEXT:    [[TMP55:%.*]] = urem i64 [[TMP52]], [[MUL_2_I]]
-; CHECK-NEXT:    [[TMP56:%.*]] = udiv i64 [[TMP55]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP57:%.*]] = urem i64 [[TMP55]], [[MUL_1_I]]
-; CHECK-NEXT:    [[TMP58:%.*]] = udiv i64 [[TMP57]], [[X]]
-; CHECK-NEXT:    [[TMP59:%.*]] = urem i64 [[TMP57]], [[X]]
-; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE6]]
-; CHECK:       [[PRED_UREM_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP60:%.*]] = phi <4 x i64> [ [[TMP45]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP54]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP62:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP56]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP64:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP58]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP65:%.*]] = phi i64 [ poison, %[[PRED_UREM_CONTINUE4]] ], [ [[TMP59]], %[[PRED_UREM_IF5]] ]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i64> [[TMP60]], i32 0
-; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[X]], [[TMP66]]
-; CHECK-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[TMP17]]
-; CHECK-NEXT:    [[TMP69:%.*]] = mul i64 [[TMP68]], [[X]]
-; CHECK-NEXT:    [[TMP70:%.*]] = add i64 [[TMP69]], [[TMP19]]
-; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP70]], [[X]]
-; CHECK-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[TMP20]]
-; CHECK-NEXT:    [[TMP73:%.*]] = shl i64 [[TMP72]], 32
-; CHECK-NEXT:    [[TMP74:%.*]] = ashr i64 [[TMP73]], 32
-; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP74]]
-; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr i64, ptr [[TMP75]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP60]], ptr [[TMP76]], i32 4, <4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP77]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[DIV_I:%.*]] = udiv i64 [[IV]], [[MUL_2_I]]
 ; CHECK-NEXT:    [[REM_I:%.*]] = urem i64 [[IV]], [[MUL_2_I]]
 ; CHECK-NEXT:    [[DIV_1_I:%.*]] = udiv i64 [[REM_I]], [[MUL_1_I]]
@@ -383,7 +248,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    store i64 [[DIV_I]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -424,6 +289,4 @@ exit:
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 048b670f0fec8..06fbeafba31c0 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -62,9 +62,9 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT:   pred.store.continue:
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.2
+; CHECK-NEXT: Successor(s): loop.1
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.2:
+; CHECK-NEXT: loop.1:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -134,8 +134,8 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:  Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT:     REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep>
@@ -144,9 +144,9 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:   pred.store.continue:
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.1
+; CHECK-NEXT: Successor(s): loop.0
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.1:
+; CHECK-NEXT: loop.0:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -207,25 +207,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT:   EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]>
 ; CHECK-NEXT:   WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
-; CHECK-NEXT: Successor(s): pred.srem
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.srem: {
-; CHECK-NEXT:   pred.srem.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK]]>
-; CHECK-NEXT:   Successor(s): pred.srem.if, pred.srem.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:   pred.srem.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> (S->V)
-; CHECK-NEXT:   Successor(s): pred.srem.continue
-; CHECK-EMPTY:
-; CHECK-NEXT:   pred.srem.continue:
-; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%rem>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
-; CHECK-NEXT:   WIDEN ir<%add> = add vp<[[PRED]]>, ir<%recur.next>
+; CHECK-NEXT:   WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
+; CHECK-NEXT:   WIDEN ir<%add> = add ir<%rem>, ir<%recur.next>
 ; CHECK-NEXT:   WIDEN ir<%and.red.next> = and ir<%and.red>, ir<%add>
 ; CHECK-NEXT:   EMIT vp<[[SEL:%.+]]> = select vp<[[MASK]]>, ir<%and.red.next>, ir<%and.red>
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -320,8 +303,8 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK:        pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%lv.2> = load ir<%gep>
+; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%conv.lv.2> = sext ir<%lv.2>
 ; CHECK-NEXT:     REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem>
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]>
@@ -332,9 +315,9 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK:        pred.store.continue:
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT:   Successor(s): loop.3
+; CHECK-NEXT:   Successor(s): loop.2
 ; CHECK-EMPTY:
-; CHECK:      loop.3:
+; CHECK:      loop.2:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -424,9 +407,9 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-NEXT:   pred.store.continue:
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.3
+; CHECK-NEXT: Successor(s): loop.2
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.3:
+; CHECK-NEXT: loop.2:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -497,18 +480,18 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT:     Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.if:
-; CHECK-NEXT:       REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:       vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>
 ; CHECK-NEXT:       REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]>
+; CHECK-NEXT:       REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:       REPLICATE store ir<%val>, ir<%gep.dst>
 ; CHECK-NEXT:     Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:     pred.store.continue:
 ; CHECK-NEXT:     No successors
 ; CHECK-NEXT:   }
-; CHECK-NEXT:   Successor(s): loop.1
+; CHECK-NEXT:   Successor(s): loop.0
 ; CHECK-EMPTY:
-; CHECK-NEXT:   loop.1:
+; CHECK-NEXT:   loop.0:
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:   No successors

>From f8006a5932b1ccdf3a1eed7b20b5cb608c0a020c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krist=C3=B3f=20Umann?= <dkszelethus at gmail.com>
Date: Thu, 25 Jul 2024 13:26:59 +0200
Subject: [PATCH 123/259] [analyzer][NFC] Add some docs for LazyCompoundValue
 (#97407)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Yes, I basically copy-pasted some posts from Discord and Artem's book,
but they make for rather decent docs.

---------

Co-authored-by: Artem Dergachev <noqnoqneo at gmail.com>
Co-authored-by: Donát Nagy <donat.nagy at ericsson.com>
Co-authored-by: Balazs Benics <benicsbalazs at gmail.com>
---
 .../StaticAnalyzer/Core/PathSensitive/SVals.h | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
index 3a4b087257149..def2970d448d4 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h
@@ -326,6 +326,10 @@ class LocAsInteger : public NonLoc {
   static bool classof(SVal V) { return V.getKind() == LocAsIntegerKind; }
 };
 
+/// The simplest example of a concrete compound value is nonloc::CompoundVal,
+/// which represents a concrete r-value of an initializer-list or a string.
+/// Internally, it contains an llvm::ImmutableList of SVal's stored inside the
+/// literal.
 class CompoundVal : public NonLoc {
   friend class ento::SValBuilder;
 
@@ -346,6 +350,36 @@ class CompoundVal : public NonLoc {
   static bool classof(SVal V) { return V.getKind() == CompoundValKind; }
 };
 
+/// While nonloc::CompoundVal covers a few simple use cases,
+/// nonloc::LazyCompoundVal is a more performant and flexible way to represent
+/// an rvalue of record type, so it shows up much more frequently during
+/// analysis. This value is an r-value that represents a snapshot of any
+/// structure "as a whole" at a given moment during the analysis. Such value is
+/// already quite far from being referred to as "concrete", as many fields
+/// inside it would be unknown or symbolic. nonloc::LazyCompoundVal operates by
+/// storing two things:
+///   * a reference to the TypedValueRegion being snapshotted (yes, it is always
+///     typed), and also
+///   * a reference to the whole Store object, obtained from the ProgramState in
+///     which the nonloc::LazyCompoundVal was created.
+///
+/// Note that the old ProgramState and its Store is kept alive during the
+/// analysis because these are immutable functional data structures and each new
+/// Store value is represented as "earlier Store" + "additional binding".
+///
+/// Essentially, nonloc::LazyCompoundVal is a performance optimization for the
+/// analyzer. Because Store is immutable, creating a nonloc::LazyCompoundVal is
+/// a very cheap operation. Note that the Store contains all region bindings in
+/// the program state, not only related to the region. Later, if necessary, such
+/// value can be unpacked -- eg. when it is assigned to another variable.
+///
+/// If you ever need to inspect the contents of the LazyCompoundVal, you can use
+/// StoreManager::iterBindings(). It'll iterate through all values in the Store,
+/// but you're only interested in the ones that belong to
+/// LazyCompoundVal::getRegion(); other bindings are immaterial.
+///
+/// NOTE: LazyCompoundVal::getRegion() itself is also immaterial (see the actual
+/// method docs for details).
 class LazyCompoundVal : public NonLoc {
   friend class ento::SValBuilder;
 
@@ -363,6 +397,18 @@ class LazyCompoundVal : public NonLoc {
   /// It might return null.
   const void *getStore() const;
 
+  /// This function itself is immaterial. It is only an implementation detail.
+  /// LazyCompoundVal represents only the rvalue, the data (known or unknown)
+  /// that *was* stored in that region *at some point in the past*. The region
+  /// should not be used for any purpose other than figuring out what part of
+  /// the frozen Store you're interested in. The value does not represent the
+  /// *current* value of that region. Sometimes it may, but this should not be
+  /// relied upon. Instead, if you want to figure out what region it represents,
+  /// you typically need to see where you got it from in the first place. The
+  /// region is absolutely not analogous to the C++ "this" pointer. It is also
+  /// not a valid way to "materialize" the prvalue into a glvalue in C++,
+  /// because the region represents the *old* storage (sometimes very old), not
+  /// the *future* storage.
   LLVM_ATTRIBUTE_RETURNS_NONNULL
   const TypedValueRegion *getRegion() const;
 

>From 2ba3fe7356f065757a2279f65e4ef5c8f1476293 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Thu, 25 Jul 2024 16:59:39 +0530
Subject: [PATCH 124/259] [lldb] Fix incorrect uses of logical operator in 'if'
 condition check (NFC) (#94779)

The condition checking for a missing class name, interpreter dictionary,
and script object incorrectly used logical AND (&&), so the 'if' block
could never be entered.

This commit uses a separate 'if' condition for each check: the class name,
the interpreter dictionary, and the script object.

Caught by cppcheck:

lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h:89:11:
warning: Identical inner 'if' condition is always true.
[identicalInnerCondition]

lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h:91:16:
warning: Identical inner 'if' condition is always true.
[identicalInnerCondition]

Fix #89195
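
For illustration, a minimal standalone sketch (hypothetical names, not the
lldb sources) of the original pattern, showing why the combined && condition
reports nothing when only one of the three pieces is missing:

```C++
#include <iostream>
#include <string>

// Hypothetical stand-ins for the real lldb checks.
std::string report(bool has_class_name, bool has_interpreter_dict,
                   bool has_script_obj) {
  // Original pattern: the outer condition only holds when *all three* pieces
  // are missing at once, so a single missing piece produces no error at all.
  if (!has_class_name && !has_interpreter_dict && !has_script_obj) {
    // And once inside, !has_class_name is already known to be true, so the
    // later branches are unreachable (cppcheck's "identical inner condition").
    if (!has_class_name)
      return "Missing script class name.";
    else if (!has_interpreter_dict)
      return "Invalid script interpreter dictionary.";
    else
      return "Missing scripting object.";
  }
  return "no error reported";
}

int main() {
  // The class name is missing, but the combined && condition is false,
  // so nothing is reported.
  std::cout << report(/*has_class_name=*/false, /*has_interpreter_dict=*/true,
                      /*has_script_obj=*/true)
            << "\n";
}
```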

---------

Co-authored-by: Shivam Gupta <shivma98.tkg at gmail.com>
---
 .../Python/Interfaces/ScriptedPythonInterface.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index e1a3156d10afd..2667f73516ba3 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -85,14 +85,15 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
     bool has_class_name = !class_name.empty();
     bool has_interpreter_dict =
         !(llvm::StringRef(m_interpreter.GetDictionaryName()).empty());
-    if (!has_class_name && !has_interpreter_dict && !script_obj) {
-      if (!has_class_name)
-        return create_error("Missing script class name.");
-      else if (!has_interpreter_dict)
-        return create_error("Invalid script interpreter dictionary.");
-      else
-        return create_error("Missing scripting object.");
-    }
+
+    if (!has_class_name)
+      return create_error("Missing script class name.");
+
+    if (!has_interpreter_dict)
+      return create_error("Invalid script interpreter dictionary.");
+
+    if (!script_obj)
+      return create_error("Missing scripting object.");
 
     Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN,
                    Locker::FreeLock);

>From 90929dd97a8f2c00148bee676bf8b44bec063d9b Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Thu, 25 Jul 2024 13:43:47 +0200
Subject: [PATCH 125/259] [analyzer] Don't invalidate the super region when a
 std object ctor runs (#100405)

CPP-5269
---
 clang/lib/StaticAnalyzer/Core/CallEvent.cpp |  19 ++++
 clang/test/Analysis/call-invalidation.cpp   | 115 ++++++++++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 0e317ec765ec0..eba224b8ec01c 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -923,12 +923,31 @@ SVal AnyCXXConstructorCall::getCXXThisVal() const {
   return UnknownVal();
 }
 
+static bool isWithinStdNamespace(const Decl *D) {
+  const DeclContext *DC = D->getDeclContext();
+  while (DC) {
+    if (const auto *NS = dyn_cast<NamespaceDecl>(DC);
+        NS && NS->isStdNamespace())
+      return true;
+    DC = DC->getParent();
+  }
+  return false;
+}
+
 void AnyCXXConstructorCall::getExtraInvalidatedValues(ValueList &Values,
                            RegionAndSymbolInvalidationTraits *ETraits) const {
   SVal V = getCXXThisVal();
   if (SymbolRef Sym = V.getAsSymbol(true))
     ETraits->setTrait(Sym,
                       RegionAndSymbolInvalidationTraits::TK_SuppressEscape);
+
+  // Standard classes don't reinterpret-cast and modify super regions.
+  const bool IsStdClassCtor = isWithinStdNamespace(getDecl());
+  if (const MemRegion *Obj = V.getAsRegion(); Obj && IsStdClassCtor) {
+    ETraits->setTrait(
+        Obj, RegionAndSymbolInvalidationTraits::TK_DoNotInvalidateSuperRegion);
+  }
+
   Values.push_back(V);
 }
 
diff --git a/clang/test/Analysis/call-invalidation.cpp b/clang/test/Analysis/call-invalidation.cpp
index 727217f228b05..fb2b892b31a1f 100644
--- a/clang/test/Analysis/call-invalidation.cpp
+++ b/clang/test/Analysis/call-invalidation.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s
 
+template <class T> void clang_analyzer_dump(T);
 void clang_analyzer_eval(bool);
 
 void usePointer(int * const *);
@@ -165,3 +166,117 @@ void testMixedConstNonConstCalls() {
   useFirstNonConstSecondConst(&(s2.x), &(s2.y));
   clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
 }
+
+namespace std {
+class Opaque {
+public:
+  Opaque();
+  int nested_member;
+};
+} // namespace std
+
+struct StdWrappingOpaque {
+  std::Opaque o; // first member
+  int uninit;
+};
+struct StdWrappingOpaqueSwapped {
+  int uninit; // first member
+  std::Opaque o;
+};
+
+int testStdCtorDoesNotInvalidateParentObject() {
+  StdWrappingOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: std::Opaque::ctor might initialized this
+  int y = obj.uninit; // FIXME: We should have a garbage read here. Read the details.
+  // As the first member ("obj.o") is invalidated, a conjured default binding is bound
+  // to the offset 0 within cluster "obj", and this masks every uninitialized fields
+  // that follows. We need a better store with extents to fix this.
+  return x + y;
+}
+
+int testStdCtorDoesNotInvalidateParentObjectSwapped() {
+  StdWrappingOpaqueSwapped obj;
+  int x = obj.o.nested_member; // no-garbage: std::Opaque::ctor might initialized this
+  int y = obj.uninit; // expected-warning {{Assigned value is garbage or undefined}}
+  return x + y;
+}
+
+class UserProvidedOpaque {
+public:
+  UserProvidedOpaque(); // might reinterpret_cast(this)
+  int nested_member;
+};
+
+struct WrappingUserProvidedOpaque {
+  UserProvidedOpaque o; // first member
+  int uninit;
+};
+struct WrappingUserProvidedOpaqueSwapped {
+  int uninit; // first member
+  UserProvidedOpaque o;
+};
+
+int testUserProvidedCtorInvalidatesParentObject() {
+  WrappingUserProvidedOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: UserProvidedOpaque::ctor might initialized this
+  int y = obj.uninit; // no-garbage: UserProvidedOpaque::ctor might reinterpret_cast(this) and write to the "uninit" member.
+  return x + y;
+}
+
+int testUserProvidedCtorInvalidatesParentObjectSwapped() {
+  WrappingUserProvidedOpaqueSwapped obj;
+  int x = obj.o.nested_member; // no-garbage: same as above
+  int y = obj.uninit; // no-garbage: same as above
+  return x + y;
+}
+
+struct WrappingStdWrappingOpaqueOuterInits {
+  int first = 1;
+  std::Opaque second;
+  int third = 3;
+  WrappingStdWrappingOpaqueOuterInits() {
+    clang_analyzer_dump(first); // expected-warning {{1 S32b}}
+    clang_analyzer_dump(second.nested_member); // expected-warning {{derived_}}
+    clang_analyzer_dump(third); // expected-warning {{3 S32b}}
+  }
+};
+
+struct WrappingUserProvidedOpaqueOuterInits {
+  int first = 1; // Potentially overwritten by UserProvidedOpaque::ctor
+  UserProvidedOpaque second; // Invalidates the object so far.
+  int third = 3; // Happens after UserProvidedOpaque::ctor, thus preserved!
+  WrappingUserProvidedOpaqueOuterInits() {
+    clang_analyzer_dump(first); // expected-warning {{derived_}}
+    clang_analyzer_dump(second.nested_member); // expected-warning {{derived_}}
+    clang_analyzer_dump(third); // expected-warning {{3 S32b}}
+  }
+};
+
+extern "C++" {
+namespace std {
+inline namespace v1 {
+namespace custom_ranges {
+struct Fancy {
+struct iterator {
+struct Opaque {
+  Opaque();
+  int nested_member;
+}; // struct Opaque
+}; // struct iterator
+}; // struct Fancy
+} // namespace custom_ranges
+} // namespace v1
+} // namespace std
+} // extern "C++"
+
+struct StdWrappingFancyOpaque {
+  int uninit;
+  std::custom_ranges::Fancy::iterator::Opaque o;
+};
+
+int testNestedStdNamespacesAndRecords() {
+  StdWrappingFancyOpaque obj;
+  int x = obj.o.nested_member; // no-garbage: ctor
+  int y = obj.uninit; // expected-warning {{Assigned value is garbage or undefined}}
+  return x + y;
+}

>From 534e2dd29f252cc13b94312895d2f4b39b54b9d4 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Thu, 25 Jul 2024 19:44:25 +0800
Subject: [PATCH 126/259] [Clang][Interp] Visit `DecompositionDecl` and create
 a local variable (#100400)

The following code should be well-formed:
```C++
float decompose_complex(_Complex float cf) {
  static _Complex float scf;
  auto &[sre, sim] = scf;
  // ok, these are references initialized by constant expressions all the way down
  static_assert(&sre == &__real scf);
  static_assert(&sim == &__imag scf);

  auto [re, im] = cf;
  return re*re + im*im;
}

```
We should visit the `DecompositionDecl` and create a local variable rather
than creating a dummy value directly.

---------

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/lib/AST/Interp/Compiler.cpp          | 4 ++++
 clang/test/SemaCXX/cxx1z-decomposition.cpp | 1 +
 2 files changed, 5 insertions(+)

diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index df55d01b8b9d6..11ca7f210d2d5 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -5239,6 +5239,10 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
           return false;
         };
 
+        // DecompositionDecls are just proxies for us.
+        if (isa<DecompositionDecl>(VD))
+          return revisit(VD);
+
         // Visit local const variables like normal.
         if ((VD->hasGlobalStorage() || VD->isLocalVarDecl() ||
              VD->isStaticDataMember()) &&
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 305a9ac2ebc24..19c730303625e 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -std=c++17 -Wc++20-extensions -verify=expected %s
 // RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -verify=expected %s
+// RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -fexperimental-new-constant-interpreter -verify=expected %s
 
 void use_from_own_init() {
   auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}

>From f6431f0c52689be4d6a4753d261cb6c415eff5a1 Mon Sep 17 00:00:00 2001
From: runseny <145632023+runseny at users.noreply.github.com>
Date: Thu, 25 Jul 2024 19:54:58 +0800
Subject: [PATCH 127/259] [MLIR][GPUToNVVM] support fastMath and other
 non-supported mathOp (#99890)

Support fastMath and other previously unsupported math ops that only require
float operands, by calling the corresponding libdevice function directly when
lowering to NVVM.

1. Lower math ops carrying a fastMath attribute to the corresponding
approximate libdevice intrinsic.
2. Some math-dialect ops were already lowered to libdevice, but the coverage
was incomplete; this PR lowers all the remaining ops that only require float
operands (see the sketch below).
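
As a rough sketch (hypothetical, simplified names, not the actual MLIR pattern
classes) of the selection rule: the f32 lowering prefers the registered
approximate libdevice function whenever the op carries an approximation
fast-math flag such as `afn`, and otherwise falls back to the exact function.

```C++
#include <cstdint>
#include <iostream>
#include <string>

// Simplified stand-in for arith::FastMathFlags.
enum class FastMathFlags : uint32_t { none = 0, afn = 1u << 0 };

// Use the approximate libdevice function only if the op requests approximate
// math and such a function was registered for this op; otherwise fall back to
// the exact one.
std::string pickF32Function(FastMathFlags flags, const std::string &f32Func,
                            const std::string &f32ApproxFunc) {
  bool wantsApprox = (static_cast<uint32_t>(FastMathFlags::afn) &
                      static_cast<uint32_t>(flags)) != 0;
  if (wantsApprox && !f32ApproxFunc.empty())
    return f32ApproxFunc;
  return f32Func;
}

int main() {
  std::cout << pickF32Function(FastMathFlags::afn, "__nv_expf", "__nv_fast_expf")
            << "\n"; // __nv_fast_expf
  std::cout << pickF32Function(FastMathFlags::none, "__nv_expf", "__nv_fast_expf")
            << "\n"; // __nv_expf
  // Ops with no registered approximate function keep the exact call.
  std::cout << pickF32Function(FastMathFlags::afn, "__nv_tanhf", "")
            << "\n"; // __nv_tanhf
}
```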
---
 .../GPUCommon/OpToFuncCallLowering.h          |  34 ++-
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  65 +++--
 .../Conversion/MathToROCDL/MathToROCDL.cpp    |   6 +-
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     | 257 ++++++++++++++++--
 4 files changed, 312 insertions(+), 50 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index ebce2d77310ae..aca25caba4dfc 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -15,9 +15,9 @@
 
 namespace mlir {
 
-/// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func`
-/// depending on the element type that Op operates upon. The function
-/// declaration is added in case it was not added before.
+/// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func` or
+/// `f32ApproxFunc` depending on the element type and the fastMathFlag of that
+/// Op. The function declaration is added in case it was not added before.
 ///
 /// If the input values are of f16 type, the value is first casted to f32, the
 /// function called and then the result casted back.
@@ -27,13 +27,22 @@ namespace mlir {
 ///
 /// will be transformed into
 ///   llvm.call @__nv_expf(%arg_f32) : (f32) -> f32
+///
+/// If the fastMathFlag attribute of SourceOp is `afn` or `fast`, this Op lowers
+/// to the approximate calculation function.
+///
+/// Also example with NVVM:
+///   %exp_f32 = math.exp %arg_f32 fastmath<afn> : f32
+///
+/// will be transformed into
+///   llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32
 template <typename SourceOp>
 struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
 public:
   explicit OpToFuncCallLowering(LLVMTypeConverter &lowering, StringRef f32Func,
-                                StringRef f64Func)
+                                StringRef f64Func, StringRef f32ApproxFunc)
       : ConvertOpToLLVMPattern<SourceOp>(lowering), f32Func(f32Func),
-        f64Func(f64Func) {}
+        f64Func(f64Func), f32ApproxFunc(f32ApproxFunc) {}
 
   LogicalResult
   matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
@@ -55,7 +64,8 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     Type resultType = castedOperands.front().getType();
     Type funcType = getFunctionType(resultType, castedOperands);
     StringRef funcName =
-        getFunctionName(cast<LLVM::LLVMFunctionType>(funcType).getReturnType());
+        getFunctionName(cast<LLVM::LLVMFunctionType>(funcType).getReturnType(),
+                        op.getFastmath());
     if (funcName.empty())
       return failure();
 
@@ -90,9 +100,14 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     return LLVM::LLVMFunctionType::get(resultType, operandTypes);
   }
 
-  StringRef getFunctionName(Type type) const {
-    if (isa<Float32Type>(type))
-      return f32Func;
+  StringRef getFunctionName(Type type, arith::FastMathFlags flag) const {
+    if (isa<Float32Type>(type)) {
+      if (((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) &&
+          !f32ApproxFunc.empty())
+        return f32ApproxFunc;
+      else
+        return f32Func;
+    }
     if (isa<Float64Type>(type))
       return f64Func;
     return "";
@@ -113,6 +128,7 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
 
   const std::string f32Func;
   const std::string f64Func;
+  const std::string f32ApproxFunc;
 };
 
 } // namespace mlir
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index fea8a0ddc7f06..faa97caacb885 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -309,10 +309,11 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
   target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
   target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
   target.addIllegalDialect<gpu::GPUDialect>();
-  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
-                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
-                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
-                      LLVM::SqrtOp>();
+  target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
+                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FMAOp,
+                      LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
+                      LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp,
+                      LLVM::SinOp, LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
@@ -321,9 +322,11 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
 template <typename OpTy>
 static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func) {
+                               StringRef f64Func,
+                               StringRef f32ApproxFunc = "") {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f32ApproxFunc);
 }
 
 void mlir::populateGpuSubgroupReduceOpLoweringPattern(
@@ -370,42 +373,68 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
       StringAttr::get(&converter.getContext(),
                       NVVM::NVVMDialect::getMaxntidAttrName()));
 
+  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
+                                    "__nv_fmod");
   populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                    "__nv_fabs");
+  populateOpPatterns<math::AcosOp>(converter, patterns, "__nv_acosf",
+                                   "__nv_acos");
+  populateOpPatterns<math::AcoshOp>(converter, patterns, "__nv_acoshf",
+                                    "__nv_acosh");
+  populateOpPatterns<math::AsinOp>(converter, patterns, "__nv_asinf",
+                                   "__nv_asin");
+  populateOpPatterns<math::AsinhOp>(converter, patterns, "__nv_asinhf",
+                                    "__nv_asinh");
   populateOpPatterns<math::AtanOp>(converter, patterns, "__nv_atanf",
                                    "__nv_atan");
   populateOpPatterns<math::Atan2Op>(converter, patterns, "__nv_atan2f",
                                     "__nv_atan2");
+  populateOpPatterns<math::AtanhOp>(converter, patterns, "__nv_atanhf",
+                                    "__nv_atanh");
   populateOpPatterns<math::CbrtOp>(converter, patterns, "__nv_cbrtf",
                                    "__nv_cbrt");
   populateOpPatterns<math::CeilOp>(converter, patterns, "__nv_ceilf",
                                    "__nv_ceil");
-  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos");
+  populateOpPatterns<math::CopySignOp>(converter, patterns, "__nv_copysignf",
+                                       "__nv_copysign");
+  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos",
+                                  "__nv_fast_cosf");
+  populateOpPatterns<math::CoshOp>(converter, patterns, "__nv_coshf",
+                                   "__nv_cosh");
   populateOpPatterns<math::ErfOp>(converter, patterns, "__nv_erff", "__nv_erf");
-  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp");
+  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp",
+                                  "__nv_fast_expf");
   populateOpPatterns<math::Exp2Op>(converter, patterns, "__nv_exp2f",
                                    "__nv_exp2");
   populateOpPatterns<math::ExpM1Op>(converter, patterns, "__nv_expm1f",
                                     "__nv_expm1");
   populateOpPatterns<math::FloorOp>(converter, patterns, "__nv_floorf",
                                     "__nv_floor");
-  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
-                                    "__nv_fmod");
-  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log");
+  populateOpPatterns<math::FmaOp>(converter, patterns, "__nv_fmaf", "__nv_fma");
+  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log",
+                                  "__nv_fast_logf");
+  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
+                                    "__nv_log10", "__nv_fast_log10f");
   populateOpPatterns<math::Log1pOp>(converter, patterns, "__nv_log1pf",
                                     "__nv_log1p");
-  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
-                                    "__nv_log10");
   populateOpPatterns<math::Log2Op>(converter, patterns, "__nv_log2f",
-                                   "__nv_log2");
-  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf",
-                                   "__nv_pow");
+                                   "__nv_log2", "__nv_fast_log2f");
+  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf", "__nv_pow",
+                                   "__nv_fast_powf");
+  populateOpPatterns<math::RoundOp>(converter, patterns, "__nv_roundf",
+                                    "__nv_round");
+  populateOpPatterns<math::RoundEvenOp>(converter, patterns, "__nv_rintf",
+                                        "__nv_rint");
   populateOpPatterns<math::RsqrtOp>(converter, patterns, "__nv_rsqrtf",
                                     "__nv_rsqrt");
-  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin");
+  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin",
+                                  "__nv_fast_sinf");
+  populateOpPatterns<math::SinhOp>(converter, patterns, "__nv_sinhf",
+                                   "__nv_sinh");
   populateOpPatterns<math::SqrtOp>(converter, patterns, "__nv_sqrtf",
                                    "__nv_sqrt");
+  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan",
+                                  "__nv_fast_tanf");
   populateOpPatterns<math::TanhOp>(converter, patterns, "__nv_tanhf",
                                    "__nv_tanh");
-  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
 }
diff --git a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
index 03c7ce5dac0d1..7de6971ba2ee7 100644
--- a/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
+++ b/mlir/lib/Conversion/MathToROCDL/MathToROCDL.cpp
@@ -38,9 +38,11 @@ using namespace mlir;
 template <typename OpTy>
 static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func) {
+                               StringRef f64Func,
+                               StringRef f32ApproxFunc = "") {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f32ApproxFunc);
 }
 
 void mlir::populateMathToROCDLConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index d914790c05fe0..c23b11e46b24c 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -254,13 +254,16 @@ gpu.module @test_module_9 {
 gpu.module @test_module_10 {
   // CHECK: llvm.func @__nv_cosf(f32) -> f32
   // CHECK: llvm.func @__nv_cos(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_cosf(f32) -> f32
   // CHECK-LABEL: func @gpu_cos
-  func.func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.cos %arg_f32 : f32
     // CHECK: llvm.call @__nv_cosf(%{{.*}}) : (f32) -> f32
     %result64 = math.cos %arg_f64 : f64
     // CHECK: llvm.call @__nv_cos(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.cos %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_cosf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -268,13 +271,16 @@ gpu.module @test_module_10 {
 gpu.module @test_module_11 {
   // CHECK: llvm.func @__nv_expf(f32) -> f32
   // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_expf(f32) -> f32
   // CHECK-LABEL: func @gpu_exp
-  func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.exp %arg_f32 : f32
     // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
     %result64 = math.exp %arg_f64 : f64
     // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.exp %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_expf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -297,13 +303,16 @@ gpu.module @test_module_12 {
 gpu.module @test_module_13 {
   // CHECK: llvm.func @__nv_logf(f32) -> f32
   // CHECK: llvm.func @__nv_log(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_logf(f32) -> f32
   // CHECK-LABEL: func @gpu_log
-  func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log %arg_f32 : f32
     // CHECK: llvm.call @__nv_logf(%{{.*}}) : (f32) -> f32
     %result64 = math.log %arg_f64 : f64
     // CHECK: llvm.call @__nv_log(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_logf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -312,13 +321,16 @@ gpu.module @test_module_13 {
 gpu.module @test_module_14 {
   // CHECK: llvm.func @__nv_log10f(f32) -> f32
   // CHECK: llvm.func @__nv_log10(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_log10f(f32) -> f32
   // CHECK-LABEL: func @gpu_log10
-  func.func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log10 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log10f(%{{.*}}) : (f32) -> f32
     %result64 = math.log10 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log10(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log10 %arg_f32 fastmath<afn> : f32
+    // CHECK: llvm.call @__nv_fast_log10f(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -342,13 +354,16 @@ gpu.module @test_module_15 {
 gpu.module @test_module_16 {
   // CHECK: llvm.func @__nv_log2f(f32) -> f32
   // CHECK: llvm.func @__nv_log2(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_log2f(f32) -> f32
   // CHECK-LABEL: func @gpu_log2
-  func.func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.log2 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log2f(%{{.*}}) : (f32) -> f32
     %result64 = math.log2 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log2(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.log2 %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_log2f(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -357,13 +372,16 @@ gpu.module @test_module_16 {
 gpu.module @test_module_17 {
   // CHECK: llvm.func @__nv_sinf(f32) -> f32
   // CHECK: llvm.func @__nv_sin(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_sinf(f32) -> f32
   // CHECK-LABEL: func @gpu_sin
-  func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_sin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.sin %arg_f32 : f32
     // CHECK: llvm.call @__nv_sinf(%{{.*}}) : (f32) -> f32
     %result64 = math.sin %arg_f64 : f64
     // CHECK: llvm.call @__nv_sin(%{{.*}}) : (f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.sin %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_sinf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -372,8 +390,9 @@ gpu.module @test_module_17 {
 gpu.module @test_module_18 {
   // CHECK: llvm.func @__nv_tanf(f32) -> f32
   // CHECK: llvm.func @__nv_tan(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_tanf(f32) -> f32
   // CHECK-LABEL: func @gpu_tan
-  func.func @gpu_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64) {
+  func.func @gpu_tan(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f32, f64, f32) {
     %result16 = math.tan %arg_f16 : f16
     // CHECK: llvm.fpext %{{.*}} : f16 to f32
     // CHECK-NEXT: llvm.call @__nv_tanf(%{{.*}}) : (f32) -> f32
@@ -382,7 +401,9 @@ gpu.module @test_module_18 {
     // CHECK: llvm.call @__nv_tanf(%{{.*}}) : (f32) -> f32
     %result64 = math.tan %arg_f64 : f64
     // CHECK: llvm.call @__nv_tan(%{{.*}}) : (f64) -> f64
-    func.return %result16, %result32, %result64 : f16, f32, f64
+    %result32Fast = math.tan %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_tanf(%{{.*}}) : (f32) -> f32
+    func.return %result16, %result32, %result64, %result32Fast : f16, f32, f64, f32
   }
 }
 
@@ -494,13 +515,16 @@ gpu.module @test_module_24 {
   // CHECK: test.symbol_scope
   // CHECK: llvm.func @__nv_expf(f32) -> f32
   // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK: llvm.func @__nv_fast_expf(f32) -> f32
   // CHECK-LABEL: func @gpu_exp
-    func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    func.func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
       %result32 = math.exp %arg_f32 : f32
       // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
       %result64 = math.exp %arg_f64 : f64
       // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
-      func.return %result32, %result64 : f32, f64
+      %result32Fast = math.exp %arg_f32 fastmath<afn> : f32
+      // CHECK: llvm.call @__nv_fast_expf(%{{.*}}) : (f32) -> f32
+      func.return %result32, %result64, %result32Fast : f32, f64, f32
     }
     "test.finish" () : () -> ()
   }) : () -> ()
@@ -526,13 +550,16 @@ gpu.module @test_module_25 {
 gpu.module @test_module_26 {
   // CHECK: llvm.func @__nv_powf(f32, f32) -> f32
   // CHECK: llvm.func @__nv_pow(f64, f64) -> f64
+  // CHECK: llvm.func @__nv_fast_powf(f32, f32) -> f32
   // CHECK-LABEL: func @gpu_pow
-  func.func @gpu_pow(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+  func.func @gpu_pow(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
     %result32 = math.powf %arg_f32, %arg_f32 : f32
     // CHECK: llvm.call @__nv_powf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
     %result64 = math.powf %arg_f64, %arg_f64 : f64
     // CHECK: llvm.call @__nv_pow(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
-    func.return %result32, %result64 : f32, f64
+    %result32Fast = math.powf %arg_f32, %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_fast_powf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
   }
 }
 
@@ -701,6 +728,194 @@ gpu.module @test_module_34 {
   }
 }
 
+gpu.module @test_module_35 {
+  // CHECK: llvm.func @__nv_acosf(f32) -> f32
+  // CHECK: llvm.func @__nv_acos(f64) -> f64
+  // CHECK-LABEL: func @gpu_acos
+  func.func @gpu_acos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.acos %arg_f32 : f32
+    // CHECK: llvm.call @__nv_acosf(%{{.*}}) : (f32) -> f32
+    %result64 = math.acos %arg_f64 : f64
+    // CHECK: llvm.call @__nv_acos(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_36 {
+  // CHECK: llvm.func @__nv_acoshf(f32) -> f32
+  // CHECK: llvm.func @__nv_acosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_acosh
+  func.func @gpu_acosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.acosh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_acoshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.acosh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_acosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_37 {
+  // CHECK: llvm.func @__nv_asinf(f32) -> f32
+  // CHECK: llvm.func @__nv_asin(f64) -> f64
+  // CHECK-LABEL: func @gpu_asin
+  func.func @gpu_asin(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.asin %arg_f32 : f32
+    // CHECK: llvm.call @__nv_asinf(%{{.*}}) : (f32) -> f32
+    %result64 = math.asin %arg_f64 : f64
+    // CHECK: llvm.call @__nv_asin(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_38 {
+  // CHECK: llvm.func @__nv_asinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_asinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_asinh
+  func.func @gpu_asinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.asinh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_asinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.asinh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_asinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_39 {
+  // CHECK: llvm.func @__nv_atanhf(f32) -> f32
+  // CHECK: llvm.func @__nv_atanh(f64) -> f64
+  // CHECK-LABEL: func @gpu_atanh
+  func.func @gpu_atanh(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64)
+      -> (f16, f32, f64) {
+    %result16 = math.atanh %arg_f16 : f16
+    // CHECK: llvm.fpext %{{.*}} : f16 to f32
+    // CHECK-NEXT: llvm.call @__nv_atanhf(%{{.*}}) : (f32) -> f32
+    // CHECK-NEXT: llvm.fptrunc %{{.*}} : f32 to f16
+    %result32 = math.atanh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_atanhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.atanh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_atanh(%{{.*}}) : (f64) -> f64
+    func.return %result16, %result32, %result64 : f16, f32, f64
+  }
+}
+
+gpu.module @test_module_40 {
+  // CHECK: llvm.func @__nv_copysignf(f32, f32) -> f32
+  // CHECK: llvm.func @__nv_copysign(f64, f64) -> f64
+  // CHECK-LABEL: func @gpu_copysign
+  func.func @gpu_copysign(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.copysign %arg_f32, %arg_f32 : f32
+    // CHECK: llvm.call @__nv_copysignf(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
+    %result64 = math.copysign %arg_f64, %arg_f64 : f64
+    // CHECK: llvm.call @__nv_copysign(%{{.*}}, %{{.*}}) : (f64, f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_41 {
+  // CHECK: llvm.func @__nv_coshf(f32) -> f32
+  // CHECK: llvm.func @__nv_cosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_cosh
+  func.func @gpu_cosh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.cosh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_coshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.cosh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_cosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_42 {
+  // CHECK: llvm.func @__nv_fmaf(f32, f32, f32) -> f32
+  // CHECK: llvm.func @__nv_fma(f64, f64, f64) -> f64
+  // CHECK-LABEL: func @gpu_fma
+  func.func @gpu_fma(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.fma %arg_f32, %arg_f32, %arg_f32 : f32
+    // CHECK: llvm.call @__nv_fmaf(%{{.*}}, %{{.*}}, %{{.*}}) : (f32, f32, f32) -> f32
+    %result64 = math.fma %arg_f64, %arg_f64, %arg_f64 : f64
+    // CHECK: llvm.call @__nv_fma(%{{.*}}, %{{.*}}, %{{.*}}) : (f64, f64, f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_43 {
+  // CHECK: llvm.func @__nv_roundf(f32) -> f32
+  // CHECK: llvm.func @__nv_round(f64) -> f64
+  // CHECK-LABEL: func @gpu_round
+  func.func @gpu_round(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.round %arg_f32 : f32
+    // CHECK: llvm.call @__nv_roundf(%{{.*}}) : (f32) -> f32
+    %result64 = math.round %arg_f64 : f64
+    // CHECK: llvm.call @__nv_round(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_44 {
+  // CHECK: llvm.func @__nv_rintf(f32) -> f32
+  // CHECK: llvm.func @__nv_rint(f64) -> f64
+  // CHECK-LABEL: func @gpu_roundeven
+  func.func @gpu_roundeven(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.roundeven %arg_f32 : f32
+    // CHECK: llvm.call @__nv_rintf(%{{.*}}) : (f32) -> f32
+    %result64 = math.roundeven %arg_f64 : f64
+    // CHECK: llvm.call @__nv_rint(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_45 {
+  // CHECK: llvm.func @__nv_sinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_sinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_sinh
+  func.func @gpu_sinh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.sinh %arg_f32 : f32
+    // CHECK: llvm.call @__nv_sinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.sinh %arg_f64 : f64
+    // CHECK: llvm.call @__nv_sinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_46 {
+  // CHECK: llvm.func @__nv_coshf(f32) -> f32
+  // CHECK: llvm.func @__nv_cosh(f64) -> f64
+  // CHECK-LABEL: func @gpu_cosh_with_fastmath
+  func.func @gpu_cosh_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.cosh %arg_f32 fastmath<fast> : f32
+    // CHECK: llvm.call @__nv_coshf(%{{.*}}) : (f32) -> f32
+    %result64 = math.cosh %arg_f64 fastmath<afn> : f64
+    // CHECK: llvm.call @__nv_cosh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_47 {
+  // CHECK: llvm.func @__nv_sinhf(f32) -> f32
+  // CHECK: llvm.func @__nv_sinh(f64) -> f64
+  // CHECK-LABEL: func @gpu_sinh_with_fastmath
+  func.func @gpu_sinh_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = math.sinh %arg_f32 fastmath<contract> : f32
+    // CHECK: llvm.call @__nv_sinhf(%{{.*}}) : (f32) -> f32
+    %result64 = math.sinh %arg_f64 fastmath<none> : f64
+    // CHECK: llvm.call @__nv_sinh(%{{.*}}) : (f64) -> f64
+    func.return %result32, %result64 : f32, f64
+  }
+}
+
+gpu.module @test_module_48 {
+  // CHECK: llvm.func @__nv_expf(f32) -> f32
+  // CHECK: llvm.func @__nv_exp(f64) -> f64
+  // CHECK-LABEL: func @gpu_exp_with_fastmath
+  func.func @gpu_exp_with_fastmath(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64, f32) {
+    %result32 = math.exp %arg_f32 fastmath<reassoc>: f32
+    // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
+    %result64 = math.exp %arg_f64 fastmath<contract>: f64
+    // CHECK: llvm.call @__nv_exp(%{{.*}}) : (f64) -> f64
+    %result32Fast = math.exp %arg_f32 fastmath<ninf> : f32
+    // CHECK: llvm.call @__nv_expf(%{{.*}}) : (f32) -> f32
+    func.return %result32, %result64, %result32Fast : f32, f64, f32
+  }
+}
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
@@ -729,9 +944,9 @@ module attributes {transform.with_named_sequence} {
       legal_dialects = ["llvm", "memref", "nvvm", "test"],
       legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
       illegal_dialects = ["gpu"],
-      illegal_ops = ["llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
-                    "llvm.ffloor", "llvm.log", "llvm.log10", "llvm.log2","llvm.pow",
-                    "llvm.sin", "llvm.sqrt"],
+      illegal_ops = ["llvm.copysign", "llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
+                    "llvm.ffloor", "llvm.fma", "llvm.frem", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
+                    "llvm.roundeven", "llvm.round", "llvm.sin", "llvm.sqrt"],
       partial_conversion
     } : !transform.any_op
     transform.yield

>From fc9b9e809381cf9232e5c81439f693d4db47efb3 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 25 Jul 2024 14:07:07 +0200
Subject: [PATCH 128/259] Revert "MTM: fix issues after cursory reading"
 (#100559)

Reverts llvm/llvm-project#100404

This broke the gcc7 build here:
https://lab.llvm.org/buildbot/#/builders/116/builds/1724
---
 llvm/lib/CodeGen/MachineTraceMetrics.cpp | 101 ++++++++++++-----------
 1 file changed, 55 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index dd1faff355b52..bf3add010574b 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -24,11 +24,17 @@
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
@@ -127,7 +133,7 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
 
   // Scale the resource cycles so they are comparable.
   unsigned PROffset = MBB->getNumber() * PRKinds;
-  for (unsigned K = 0; K < PRKinds; ++K)
+  for (unsigned K = 0; K != PRKinds; ++K)
     ProcReleaseAtCycles[PROffset + K] =
       PRCycles[K] * SchedModel.getResourceFactor(K);
 
@@ -140,14 +146,15 @@ MachineTraceMetrics::getProcReleaseAtCycles(unsigned MBBNum) const {
          "getResources() must be called before getProcReleaseAtCycles()");
   unsigned PRKinds = SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcReleaseAtCycles.size());
-  return ArrayRef{ProcReleaseAtCycles.data() + MBBNum * PRKinds, PRKinds};
+  return ArrayRef(ProcReleaseAtCycles.data() + MBBNum * PRKinds, PRKinds);
 }
 
 //===----------------------------------------------------------------------===//
 //                         Ensemble utility functions
 //===----------------------------------------------------------------------===//
 
-MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *CT) : MTM(*CT) {
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
+  : MTM(*ct) {
   BlockInfo.resize(MTM.BlockInfo.size());
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);
@@ -191,7 +198,7 @@ computeDepthResources(const MachineBasicBlock *MBB) {
   // Compute per-resource depths.
   ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);
   ArrayRef<unsigned> PredPRCycles = MTM.getProcReleaseAtCycles(PredNum);
-  for (unsigned K = 0; K < PRKinds; ++K)
+  for (unsigned K = 0; K != PRKinds; ++K)
     ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];
 }
 
@@ -224,7 +231,7 @@ computeHeightResources(const MachineBasicBlock *MBB) {
 
   // Compute per-resource heights.
   ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);
-  for (unsigned K = 0; K < PRKinds; ++K)
+  for (unsigned K = 0; K != PRKinds; ++K)
     ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];
 }
 
@@ -257,7 +264,7 @@ MachineTraceMetrics::Ensemble::
 getProcResourceDepths(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
-  return ArrayRef{ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds};
+  return ArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds);
 }
 
 /// Get an array of processor resource heights for MBB. Indexed by processor
@@ -270,7 +277,7 @@ MachineTraceMetrics::Ensemble::
 getProcResourceHeights(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
-  return ArrayRef{ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds};
+  return ArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds);
 }
 
 //===----------------------------------------------------------------------===//
@@ -307,8 +314,8 @@ class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
   const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) override;
 
 public:
-  MinInstrCountEnsemble(MachineTraceMetrics *MTM)
-      : MachineTraceMetrics::Ensemble(MTM) {}
+  MinInstrCountEnsemble(MachineTraceMetrics *mtm)
+    : MachineTraceMetrics::Ensemble(mtm) {}
 };
 
 /// Pick only the current basic block for the trace and do not choose any
@@ -388,15 +395,15 @@ MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
 
 // Get an Ensemble sub-class for the requested trace strategy.
 MachineTraceMetrics::Ensemble *
-MachineTraceMetrics::getEnsemble(MachineTraceStrategy Strategy) {
-  assert(Strategy < MachineTraceStrategy::TS_NumStrategies &&
+MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) {
+  assert(strategy < MachineTraceStrategy::TS_NumStrategies &&
          "Invalid trace strategy enum");
-  Ensemble *&E = Ensembles[static_cast<size_t>(Strategy)];
+  Ensemble *&E = Ensembles[static_cast<size_t>(strategy)];
   if (E)
     return E;
 
   // Allocate new Ensemble on demand.
-  switch (Strategy) {
+  switch (strategy) {
   case MachineTraceStrategy::TS_MinInstrCount:
     return (E = new MinInstrCountEnsemble(this));
   case MachineTraceStrategy::TS_Local:
@@ -441,9 +448,8 @@ struct LoopBounds {
   const MachineLoopInfo *Loops;
   bool Downward = false;
 
-  LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks,
-             const MachineLoopInfo *Loops)
-      : Blocks(Blocks), Loops(Loops) {}
+  LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
+             const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
 };
 
 } // end anonymous namespace
@@ -457,7 +463,7 @@ class po_iterator_storage<LoopBounds, true> {
   LoopBounds &LB;
 
 public:
-  po_iterator_storage(LoopBounds &LB) : LB(LB) {}
+  po_iterator_storage(LoopBounds &lb) : LB(lb) {}
 
   void finishPostorder(const MachineBasicBlock*) {}
 
@@ -540,7 +546,7 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
   if (BadTBI.hasValidHeight()) {
     BadTBI.invalidateHeight();
     WorkList.push_back(BadMBB);
-    while (!WorkList.empty()) {
+    do {
       const MachineBasicBlock *MBB = WorkList.pop_back_val();
       LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
                         << getName() << " height.\n");
@@ -558,14 +564,14 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
         // Verify that TBI.Succ is actually a *I successor.
         assert((!TBI.Succ || Pred->isSuccessor(TBI.Succ)) && "CFG changed");
       }
-    }
+    } while (!WorkList.empty());
   }
 
   // Invalidate depth resources of blocks below MBB.
   if (BadTBI.hasValidDepth()) {
     BadTBI.invalidateDepth();
     WorkList.push_back(BadMBB);
-    while (!WorkList.empty()) {
+    do {
       const MachineBasicBlock *MBB = WorkList.pop_back_val();
       LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
                         << getName() << " depth.\n");
@@ -583,7 +589,7 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
         // Verify that TBI.Pred is actually a *I predecessor.
         assert((!TBI.Pred || Succ->isPredecessor(TBI.Pred)) && "CFG changed");
       }
-    }
+    } while (!WorkList.empty());
   }
 
   // Clear any per-instruction data. We only have to do this for BadMBB itself
@@ -599,7 +605,7 @@ void MachineTraceMetrics::Ensemble::verify() const {
 #ifndef NDEBUG
   assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
          "Outdated BlockInfo size");
-  for (unsigned Num = 0; Num < BlockInfo.size(); ++Num) {
+  for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
     const TraceBlockInfo &TBI = BlockInfo[Num];
     if (TBI.hasValidDepth() && TBI.Pred) {
       const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
@@ -680,7 +686,7 @@ static bool getDataDeps(const MachineInstr &UseMI,
     }
     // Collect virtual register reads.
     if (MO.readsReg())
-      Deps.emplace_back(MRI, Reg, MO.getOperandNo());
+      Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo()));
   }
   return HasPhysRegs;
 }
@@ -696,10 +702,10 @@ static void getPHIDeps(const MachineInstr &UseMI,
   if (!Pred)
     return;
   assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
-  for (unsigned Idx = 1; Idx < UseMI.getNumOperands(); Idx += 2) {
-    if (UseMI.getOperand(Idx + 1).getMBB() == Pred) {
-      Register Reg = UseMI.getOperand(Idx).getReg();
-      Deps.emplace_back(MRI, Reg, Idx);
+  for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
+    if (UseMI.getOperand(i + 1).getMBB() == Pred) {
+      Register Reg = UseMI.getOperand(i).getReg();
+      Deps.push_back(DataDep(MRI, Reg, i));
       return;
     }
   }
@@ -733,7 +739,7 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
       SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit);
       if (I == RegUnits.end())
         continue;
-      Deps.emplace_back(I->MI, I->Op, MO.getOperandNo());
+      Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo()));
       break;
     }
   }
@@ -846,14 +852,14 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
   // implies Head->HasValidInstrDepths, so we only need to start from the first
   // block in the trace that needs to be recomputed.
   SmallVector<const MachineBasicBlock*, 8> Stack;
-  while (MBB) {
+  do {
     TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
     assert(TBI.hasValidDepth() && "Incomplete trace");
     if (TBI.HasValidInstrDepths)
       break;
     Stack.push_back(MBB);
     MBB = TBI.Pred;
-  }
+  } while (MBB);
 
   // FIXME: If MBB is non-null at this point, it is the last pre-computed block
   // in the trace. We should track any live-out physregs that were defined in
@@ -874,7 +880,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
     LLVM_DEBUG({
       dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
       ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
-      for (unsigned K = 0; K < PRDepths.size(); ++K)
+      for (unsigned K = 0; K != PRDepths.size(); ++K)
         if (PRDepths[K]) {
           unsigned Factor = MTM.SchedModel.getResourceFactor(K);
           dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))
@@ -963,8 +969,10 @@ static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
                                                   Dep.UseOp);
 
   // Update Heights[DefMI] to be the maximum height seen.
-  const auto &[I, Inserted] = Heights.insert({Dep.DefMI, UseHeight});
-  if (Inserted)
+  MIHeightMap::iterator I;
+  bool New;
+  std::tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
+  if (New)
     return true;
 
   // DefMI has been pushed before. Give it the max height.
@@ -1002,7 +1010,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
   // The bottom of the trace may already be computed.
   // Find the blocks that need updating.
   SmallVector<const MachineBasicBlock*, 8> Stack;
-  while (MBB) {
+  do {
     TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
     assert(TBI.hasValidHeight() && "Incomplete trace");
     if (TBI.HasValidInstrHeights)
@@ -1010,7 +1018,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
     Stack.push_back(MBB);
     TBI.LiveIns.clear();
     MBB = TBI.Succ;
-  }
+  } while (MBB);
 
   // As we move upwards in the trace, keep track of instructions that are
   // required by deeper trace instructions. Map MI -> height required so far.
@@ -1052,7 +1060,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
     LLVM_DEBUG({
       dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
       ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
-      for (unsigned K = 0; K < PRHeights.size(); ++K)
+      for (unsigned K = 0; K != PRHeights.size(); ++K)
         if (PRHeights[K]) {
           unsigned Factor = MTM.SchedModel.getResourceFactor(K);
           dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))
@@ -1137,7 +1145,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
 
     // Transfer the live regunits to the live-in list.
     for (const LiveRegUnit &RU : RegUnits) {
-      TBI.LiveIns.emplace_back(RU.RegUnit, RU.Cycle);
+      TBI.LiveIns.push_back(LiveInReg(RU.RegUnit, RU.Cycle));
       LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RU.RegUnit, MTM.TRI) << '@'
                         << RU.Cycle);
     }
@@ -1197,7 +1205,7 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
   ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
   if (Bottom) {
     ArrayRef<unsigned> PRCycles = TE.MTM.getProcReleaseAtCycles(getBlockNum());
-    for (unsigned K = 0; K < PRDepths.size(); ++K)
+    for (unsigned K = 0; K != PRDepths.size(); ++K)
       PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);
   } else {
     for (unsigned PRD : PRDepths)
@@ -1227,8 +1235,9 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
   unsigned PRMax = 0;
 
   // Capture computing cycles from extra instructions
-  auto ExtraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
-                            unsigned ResourceIdx) -> unsigned {
+  auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
+                            unsigned ResourceIdx)
+                         ->unsigned {
     unsigned Cycles = 0;
     for (const MCSchedClassDesc *SC : Instrs) {
       if (!SC->isValid())
@@ -1246,12 +1255,12 @@ unsigned MachineTraceMetrics::Trace::getResourceLength(
     return Cycles;
   };
 
-  for (unsigned K = 0; K < PRDepths.size(); ++K) {
+  for (unsigned K = 0; K != PRDepths.size(); ++K) {
     unsigned PRCycles = PRDepths[K] + PRHeights[K];
     for (const MachineBasicBlock *MBB : Extrablocks)
       PRCycles += TE.MTM.getProcReleaseAtCycles(MBB->getNumber())[K];
-    PRCycles += ExtraCycles(ExtraInstrs, K);
-    PRCycles -= ExtraCycles(RemoveInstrs, K);
+    PRCycles += extraCycles(ExtraInstrs, K);
+    PRCycles -= extraCycles(RemoveInstrs, K);
     PRMax = std::max(PRMax, PRCycles);
   }
   // Convert to cycle count.
@@ -1283,9 +1292,9 @@ bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr &DefMI,
 
 void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
   OS << getName() << " ensemble:\n";
-  for (unsigned Idx = 0; Idx < BlockInfo.size(); ++Idx) {
-    OS << "  %bb." << Idx << '\t';
-    BlockInfo[Idx].print(OS);
+  for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
+    OS << "  %bb." << i << '\t';
+    BlockInfo[i].print(OS);
     OS << '\n';
   }
 }

>From f916cb6184fb04b27ae7b867edcfd162c3a49694 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk at outlook.com>
Date: Thu, 25 Jul 2024 15:08:18 +0300
Subject: [PATCH 129/259] [Clang] fix assertion failure in invalid delete
 operator declaration check (#99308)

Fixes #96191
---
 clang/docs/ReleaseNotes.rst                    | 1 +
 clang/lib/Sema/SemaExprCXX.cpp                 | 3 +++
 clang/test/SemaCXX/cxx2a-destroying-delete.cpp | 9 +++++++++
 3 files changed, 13 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 0b79e952b48af..9f5ee9c82ab0a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -150,6 +150,7 @@ Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 - Fixed a crash when an expression with a dependent ``__typeof__`` type is used as the operand of a unary operator. (#GH97646)
+- Fixed a failed assertion when checking invalid delete operator declaration. (#GH96191)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 14d1f395af90e..51c4a36808fce 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -3806,6 +3806,9 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal,
                                                      Overaligned, DeleteName);
     }
 
+    if (OperatorDelete->isInvalidDecl())
+      return ExprError();
+
     MarkFunctionReferenced(StartLoc, OperatorDelete);
 
     // Check access and ambiguity of destructor if we're going to call it.
diff --git a/clang/test/SemaCXX/cxx2a-destroying-delete.cpp b/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
index 349e6e9538a4c..25b985ef11d15 100644
--- a/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
+++ b/clang/test/SemaCXX/cxx2a-destroying-delete.cpp
@@ -187,3 +187,12 @@ namespace delete_from_new {
 #endif
   }
 }
+
+namespace GH96191 {
+  struct S {};
+  struct T {
+    void operator delete(S) { } // expected-error {{first parameter of 'operator delete' must have type 'void *'}}
+  };
+
+  void foo(T *t) { delete t; }
+}

>From a466db2b32cccfdbd8bbd27cfa2fb51651192075 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Thu, 25 Jul 2024 13:18:11 +0100
Subject: [PATCH 130/259] Revert "[lldb] Fix incorrect logical operator in 'if'
 condition check (NFC)" (#100561)

Reverts llvm/llvm-project#94779

Due to bot failures:
https://lab.llvm.org/buildbot/#/builders/18/builds/1371
---
 .../Python/Interfaces/ScriptedPythonInterface.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index 2667f73516ba3..e1a3156d10afd 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -85,15 +85,14 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
     bool has_class_name = !class_name.empty();
     bool has_interpreter_dict =
         !(llvm::StringRef(m_interpreter.GetDictionaryName()).empty());
-
-    if (!has_class_name)
-      return create_error("Missing script class name.");
-
-    if (!has_interpreter_dict)
-      return create_error("Invalid script interpreter dictionary.");
-
-    if (!script_obj)
-      return create_error("Missing scripting object.");
+    if (!has_class_name && !has_interpreter_dict && !script_obj) {
+      if (!has_class_name)
+        return create_error("Missing script class name.");
+      else if (!has_interpreter_dict)
+        return create_error("Invalid script interpreter dictionary.");
+      else
+        return create_error("Missing scripting object.");
+    }
 
     Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN,
                    Locker::FreeLock);

>From af1d2b9fb1c6065d3a4611f086b8c83a0c05795c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Jul 2024 16:39:39 +0400
Subject: [PATCH 131/259] CodeGen: Remove -disable-debug-info-print cl::opt
 (#100319)

This was first introduced back in 2010 by
6c74a872a8d34d41b751efb68e335cbe91b5a5cc, and there is little evidence
of it being used. Only one test attempts to make use of it, and the
option is redundant there since the test already uses strip to drop
debug info anyway (which also makes the test buggy, since it's intended
to test both with and without debug info).

The other tests using it were only added in later commits, to test the
option after it was discovered to be untested when it was moved.
---
 llvm/lib/CodeGen/MachineModuleInfo.cpp        | 20 +-------
 .../Generic/disable-debug-info-print.ll       | 50 -------------------
 .../X86/disable-debug-info-print-codeview.ll  | 19 -------
 llvm/test/CodeGen/X86/frame-order.ll          |  4 +-
 4 files changed, 4 insertions(+), 89 deletions(-)
 delete mode 100644 llvm/test/CodeGen/Generic/disable-debug-info-print.ll
 delete mode 100644 llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll

diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index b950f4fdbcf79..088e76029f1a3 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
@@ -16,25 +14,13 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include <algorithm>
 #include <cassert>
-#include <memory>
-#include <utility>
-#include <vector>
 
 using namespace llvm;
 using namespace llvm::dwarf;
 
-static cl::opt<bool>
-    DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
-                             cl::desc("Disable debug info printing"));
-
 // Out of line virtual method.
 MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
 
@@ -224,8 +210,7 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable = !DisableDebugInfoPrinting &&
-                         !M.debug_compile_units().empty();
+  MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
   return false;
 }
 
@@ -250,7 +235,6 @@ MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable =
-      !DisableDebugInfoPrinting && !M.debug_compile_units().empty();
+  MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
   return Result(MMI);
 }
diff --git a/llvm/test/CodeGen/Generic/disable-debug-info-print.ll b/llvm/test/CodeGen/Generic/disable-debug-info-print.ll
deleted file mode 100644
index befa91c15d3c8..0000000000000
--- a/llvm/test/CodeGen/Generic/disable-debug-info-print.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -disable-debug-info-print=true -exception-model=dwarf -o - %s | FileCheck %s
-; RUN: llc -disable-debug-info-print=true -exception-model=sjlj -o - %s | FileCheck %s --check-prefix=SJLJ-CHECK
-
-define i16 @main() nounwind !dbg !7 {
-entry:
-  ret i16 0, !dbg !9
-}
-
-define i16 @helper() !dbg !10 {
-entry:
-  ret i16 0, !dbg !11
-}
-
-
-; CHECK: main
-; CHECK-NOT: cfi_startproc
-; CHECK-NOT: .file
-; CHECK-NOT: .loc
-; CHECK: helper
-; CHECK: cfi_startproc
-; CHECK-NOT: .file
-; CHECK-NOT: .loc
-; CHECK: cfi_endproc
-
-; SJLJ-CHECK: main
-; SJLJ-CHECK-NOT: cfi_startproc
-; SJLJ-CHECK-NOT: .file
-; SJLJ-CHECK-NOT: .loc
-; SJLJ-CHECK: helper
-; SJLJ-CHECK-NOT: cfi_startproc
-; SJLJ-CHECK-NOT: .file
-; SJLJ-CHECK-NOT: .loc
-
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "unwind-tables.c", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 7, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 12.0.0"}
-!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 2, column: 3, scope: !7)
-!10 = distinct !DISubprogram(name: "helper", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!11 = !DILocation(line: 2, column: 3, scope: !10)
diff --git a/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll b/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll
deleted file mode 100644
index 930dafc119b3c..0000000000000
--- a/llvm/test/CodeGen/X86/disable-debug-info-print-codeview.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc -disable-debug-info-print -o - %s | FileCheck %s
-
-; Check that debug info isn't emitted for CodeView with
-; -disable-debug-info-print.
-
-; CHECK-NOT:      CodeViewTypes
-; CHECK-NOT:      CodeViewDebugInfo
-
-source_filename = "empty"
-target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc19.0.24215"
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: FullDebug)
-!1 = !DIFile(filename: "empty", directory: "path/to")
-!2 = !{i32 2, !"CodeView", i32 1}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/CodeGen/X86/frame-order.ll b/llvm/test/CodeGen/X86/frame-order.ll
index 2857a738e8efc..dcbcb481f927c 100644
--- a/llvm/test/CodeGen/X86/frame-order.ll
+++ b/llvm/test/CodeGen/X86/frame-order.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print < %s | FileCheck %s
-; RUN: opt -passes=strip -S < %s | llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnueabi < %s | FileCheck %s
+; RUN: opt -passes=strip -S < %s | llc -mtriple=x86_64-linux-gnueabi | FileCheck %s
 
 ; This test checks if the code is generated correctly with and without debug info.
 

>From bf76290de48512f59f62eff20d28135c3f918ea5 Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer at amd.com>
Date: Thu, 25 Jul 2024 13:52:50 +0100
Subject: [PATCH 132/259] [flang][debug] Set scope of internal functions
 correctly. (#99531)

Functions internal to a subroutine should have their scope set to the
parent function. This allows a user to evaluate local variables of the
parent function when control is stopped in the child.

Fixes #96314
---
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 181 ++++++++++--------
 flang/test/Transforms/debug-96314.fir         |  26 +++
 2 files changed, 132 insertions(+), 75 deletions(-)
 create mode 100644 flang/test/Transforms/debug-96314.fir

diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 577d37cc34b96..3c067bf946cfc 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -66,6 +66,9 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
   void handleGlobalOp(fir::GlobalOp glocalOp, mlir::LLVM::DIFileAttr fileAttr,
                       mlir::LLVM::DIScopeAttr scope,
                       mlir::SymbolTable *symbolTable);
+  void handleFuncOp(mlir::func::FuncOp funcOp, mlir::LLVM::DIFileAttr fileAttr,
+                    mlir::LLVM::DICompileUnitAttr cuAttr,
+                    mlir::SymbolTable *symbolTable);
 };
 
 static uint32_t getLineFromLoc(mlir::Location loc) {
@@ -207,11 +210,112 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
   globalOp->setLoc(builder.getFusedLoc({globalOp->getLoc()}, gvAttr));
 }
 
+void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
+                                    mlir::LLVM::DIFileAttr fileAttr,
+                                    mlir::LLVM::DICompileUnitAttr cuAttr,
+                                    mlir::SymbolTable *symbolTable) {
+  mlir::Location l = funcOp->getLoc();
+  // If fused location has already been created then nothing to do
+  // Otherwise, create a fused location.
+  if (debugInfoIsAlreadySet(l))
+    return;
+
+  mlir::ModuleOp module = getOperation();
+  mlir::MLIRContext *context = &getContext();
+  mlir::OpBuilder builder(context);
+  llvm::StringRef fileName(fileAttr.getName());
+  llvm::StringRef filePath(fileAttr.getDirectory());
+  unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
+                        ? llvm::dwarf::getCallingConvention("DW_CC_program")
+                        : llvm::dwarf::getCallingConvention("DW_CC_normal");
+
+  if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
+    fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
+    filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
+  }
+
+  mlir::StringAttr fullName = mlir::StringAttr::get(context, funcOp.getName());
+  mlir::Attribute attr = funcOp->getAttr(fir::getInternalFuncNameAttrName());
+  mlir::StringAttr funcName =
+      (attr) ? mlir::cast<mlir::StringAttr>(attr)
+             : mlir::StringAttr::get(context, funcOp.getName());
+
+  auto result = fir::NameUniquer::deconstruct(funcName);
+  funcName = mlir::StringAttr::get(context, result.second.name);
+
+  llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
+  fir::DebugTypeGenerator typeGen(module);
+  for (auto resTy : funcOp.getResultTypes()) {
+    auto tyAttr = typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc());
+    types.push_back(tyAttr);
+  }
+  for (auto inTy : funcOp.getArgumentTypes()) {
+    auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr,
+                                      cuAttr, funcOp.getLoc());
+    types.push_back(tyAttr);
+  }
+
+  mlir::LLVM::DISubroutineTypeAttr subTypeAttr =
+      mlir::LLVM::DISubroutineTypeAttr::get(context, CC, types);
+  mlir::LLVM::DIFileAttr funcFileAttr =
+      mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
+
+  // Only definitions need a distinct identifier and a compilation unit.
+  mlir::DistinctAttr id;
+  mlir::LLVM::DIScopeAttr Scope = fileAttr;
+  mlir::LLVM::DICompileUnitAttr compilationUnit;
+  mlir::LLVM::DISubprogramFlags subprogramFlags =
+      mlir::LLVM::DISubprogramFlags{};
+  if (isOptimized)
+    subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
+  if (!funcOp.isExternal()) {
+    id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+    compilationUnit = cuAttr;
+    subprogramFlags =
+        subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
+  }
+  unsigned line = getLineFromLoc(l);
+  if (fir::isInternalProcedure(funcOp)) {
+    // For contained functions, the scope is the parent subroutine.
+    mlir::SymbolRefAttr sym = mlir::cast<mlir::SymbolRefAttr>(
+        funcOp->getAttr(fir::getHostSymbolAttrName()));
+    if (sym) {
+      if (auto func =
+              symbolTable->lookup<mlir::func::FuncOp>(sym.getLeafReference())) {
+        // Make sure that parent is processed.
+        handleFuncOp(func, fileAttr, cuAttr, symbolTable);
+        if (auto fusedLoc =
+                mlir::dyn_cast_if_present<mlir::FusedLoc>(func.getLoc())) {
+          if (auto spAttr =
+                  mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(
+                      fusedLoc.getMetadata()))
+            Scope = spAttr;
+        }
+      }
+    }
+  } else if (!result.second.modules.empty()) {
+    Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
+                                  line - 1, false);
+  }
+
+  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+      context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+      line, line, subprogramFlags, subTypeAttr);
+  funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
+
+  // Don't process variables if user asked for line tables only.
+  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
+    return;
+
+  funcOp.walk([&](fir::cg::XDeclareOp declOp) {
+    handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable);
+  });
+}
+
 void AddDebugInfoPass::runOnOperation() {
   mlir::ModuleOp module = getOperation();
   mlir::MLIRContext *context = &getContext();
   mlir::SymbolTable symbolTable(module);
-  mlir::OpBuilder builder(context);
   llvm::StringRef fileName;
   std::string filePath;
   // We need 2 type of file paths here.
@@ -248,80 +352,7 @@ void AddDebugInfoPass::runOnOperation() {
       isOptimized, debugLevel);
 
   module.walk([&](mlir::func::FuncOp funcOp) {
-    mlir::Location l = funcOp->getLoc();
-    // If fused location has already been created then nothing to do
-    // Otherwise, create a fused location.
-    if (debugInfoIsAlreadySet(l))
-      return;
-
-    unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
-                          ? llvm::dwarf::getCallingConvention("DW_CC_program")
-                          : llvm::dwarf::getCallingConvention("DW_CC_normal");
-
-    if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
-      fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
-      filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
-    }
-
-    mlir::StringAttr fullName =
-        mlir::StringAttr::get(context, funcOp.getName());
-    mlir::Attribute attr = funcOp->getAttr(fir::getInternalFuncNameAttrName());
-    mlir::StringAttr funcName =
-        (attr) ? mlir::cast<mlir::StringAttr>(attr)
-               : mlir::StringAttr::get(context, funcOp.getName());
-
-    auto result = fir::NameUniquer::deconstruct(funcName);
-    funcName = mlir::StringAttr::get(context, result.second.name);
-
-    llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
-    fir::DebugTypeGenerator typeGen(module);
-    for (auto resTy : funcOp.getResultTypes()) {
-      auto tyAttr =
-          typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc());
-      types.push_back(tyAttr);
-    }
-    for (auto inTy : funcOp.getArgumentTypes()) {
-      auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr,
-                                        cuAttr, funcOp.getLoc());
-      types.push_back(tyAttr);
-    }
-
-    mlir::LLVM::DISubroutineTypeAttr subTypeAttr =
-        mlir::LLVM::DISubroutineTypeAttr::get(context, CC, types);
-    mlir::LLVM::DIFileAttr funcFileAttr =
-        mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
-
-    // Only definitions need a distinct identifier and a compilation unit.
-    mlir::DistinctAttr id;
-    mlir::LLVM::DIScopeAttr Scope = fileAttr;
-    mlir::LLVM::DICompileUnitAttr compilationUnit;
-    mlir::LLVM::DISubprogramFlags subprogramFlags =
-        mlir::LLVM::DISubprogramFlags{};
-    if (isOptimized)
-      subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
-    if (!funcOp.isExternal()) {
-      id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
-      compilationUnit = cuAttr;
-      subprogramFlags =
-          subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
-    }
-    unsigned line = getLineFromLoc(l);
-    if (!result.second.modules.empty())
-      Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
-                                    line - 1, false);
-
-    auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-        context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
-        line, line, subprogramFlags, subTypeAttr);
-    funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-
-    // Don't process variables if user asked for line tables only.
-    if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
-      return;
-
-    funcOp.walk([&](fir::cg::XDeclareOp declOp) {
-      handleDeclareOp(declOp, fileAttr, spAttr, typeGen, &symbolTable);
-    });
+    handleFuncOp(funcOp, fileAttr, cuAttr, &symbolTable);
   });
   // Process any global which was not processed through DeclareOp.
   if (debugLevel == mlir::LLVM::DIEmissionKind::Full) {
diff --git a/flang/test/Transforms/debug-96314.fir b/flang/test/Transforms/debug-96314.fir
new file mode 100644
index 0000000000000..e2d0f24a1105c
--- /dev/null
+++ b/flang/test/Transforms/debug-96314.fir
@@ -0,0 +1,26 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s -o - | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
+  func.func @_QMhelperPmod_sub(%arg0: !fir.ref<i32> {fir.bindc_name = "a"} ) {
+    return
+  } loc(#loc1)
+  func.func private @_QMhelperFmod_subPchild1(%arg0: !fir.ref<i32> {fir.bindc_name = "b"} ) attributes {fir.host_symbol = @_QMhelperPmod_sub, llvm.linkage = #llvm.linkage<internal>} {
+    return
+  } loc(#loc2)
+  func.func @global_sub_(%arg0: !fir.ref<i32> {fir.bindc_name = "n"} ) attributes {fir.internal_name = "_QPglobal_sub"} {
+    return
+  } loc(#loc3)
+  func.func private @_QFglobal_subPchild2(%arg0: !fir.ref<i32> {fir.bindc_name = "c"}) attributes {fir.host_symbol = @global_sub_, llvm.linkage = #llvm.linkage<internal>} {
+    return
+  } loc(#loc4)
+}
+
+#loc1 = loc("test.f90":5:1)
+#loc2 = loc("test.f90":15:1)
+#loc3 = loc("test.f90":25:1)
+#loc4 = loc("test.f90":35:1)
+
+// CHECK-DAG: #[[SP1:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}scope = #[[SP1]], name = "child1"{{.*}}>
+// CHECK-DAG: #[[SP2:.*]] = #llvm.di_subprogram<{{.*}}linkageName = "global_sub_"{{.*}}>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}scope = #[[SP2]], name = "child2"{{.*}}>

>From 0fedfd83d75415837eb91f56ec24f4b392bf6c57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Thu, 25 Jul 2024 13:35:16 +0200
Subject: [PATCH 133/259] [clang][Interp][NFC] Protect getPtrBase{,Pop} ops
 from past-end ptrs

---
 clang/lib/AST/Interp/Interp.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 96c59dd14c04a..81c1b6a52189c 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1571,7 +1571,10 @@ inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
     return false;
   if (!CheckSubobject(S, OpPC, Ptr, CSK_Base))
     return false;
-  S.Stk.push<Pointer>(Ptr.atField(Off));
+  const Pointer &Result = Ptr.atField(Off);
+  if (Result.isPastEnd())
+    return false;
+  S.Stk.push<Pointer>(Result);
   return true;
 }
 
@@ -1581,7 +1584,10 @@ inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) {
     return false;
   if (!CheckSubobject(S, OpPC, Ptr, CSK_Base))
     return false;
-  S.Stk.push<Pointer>(Ptr.atField(Off));
+  const Pointer &Result = Ptr.atField(Off);
+  if (Result.isPastEnd())
+    return false;
+  S.Stk.push<Pointer>(Result);
   return true;
 }
 

>From dfeb3991fb489a703f631ab0c34b58f80568038d Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Thu, 25 Jul 2024 09:19:22 -0400
Subject: [PATCH 134/259] Remove the `x86_mmx` IR type. (#98505)

It is now translated to `<1 x i64>`, which allows the removal of a bunch
of special casing.

This _incompatibly_ changes the ABI of any LLVM IR function with
`x86_mmx` arguments or returns: instead of passing in mmx registers,
they will now be passed via integer registers. However, the real-world
incompatibility caused by this is expected to be minimal, because Clang
never uses the x86_mmx type -- it lowers `__m64` to either `<1 x i64>`
or `double`, depending on ABI.

This change does _not_ eliminate the SelectionDAG `MVT::x86mmx` type.
That type simply no longer corresponds to an IR type, and is used only
by MMX intrinsics and inline-asm operands.

Because SelectionDAGBuilder only knows how to generate the
operands/results of intrinsics based on the IR type, it now generates
the intrinsics with the type MVT::v1i64 instead of MVT::x86mmx. We need
to fix this up before DAG type legalization, so the X86 backend fixes
them up in DAGCombine. (This may be a short-lived hack, if all the MMX
intrinsics can be removed in upcoming changes.)

Works towards issue #98272.
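
For reference, a minimal sketch of the replacement pattern used
throughout this diff when C++ code previously asked for the x86_mmx
type; `Ctx` here is a placeholder for an existing LLVMContext, not a
name from the patch:

    #include "llvm/IR/DerivedTypes.h"  // FixedVectorType, IntegerType
    using namespace llvm;

    // Previously: Type *T = Type::getX86_MMXTy(Ctx);
    // The same 64-bit MMX value is now modeled as a <1 x i64> vector:
    Type *T = FixedVectorType::get(IntegerType::get(Ctx, 64), 1);

Code that formerly special-cased isX86_MMXTy() can then treat the value
as an ordinary one-element vector of i64.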
---
 llvm/bindings/ocaml/llvm/llvm.mli             |    4 -
 llvm/bindings/ocaml/llvm/llvm_ocaml.c         |    5 -
 llvm/docs/BitCodeFormat.rst                   |    2 +-
 llvm/docs/LangRef.rst                         |   20 +-
 llvm/docs/ReleaseNotes.rst                    |   15 +
 llvm/include/llvm-c/Core.h                    |   48 +-
 llvm/include/llvm/IR/DataLayout.h             |    1 -
 llvm/include/llvm/IR/Type.h                   |   12 +-
 llvm/lib/Analysis/ConstantFolding.cpp         |    8 +-
 llvm/lib/AsmParser/LLLexer.cpp                |    3 +-
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |    4 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     |    5 +-
 llvm/lib/CodeGen/ValueTypes.cpp               |    7 +-
 llvm/lib/IR/AsmWriter.cpp                     |    5 +-
 llvm/lib/IR/ConstantFold.cpp                  |    2 +-
 llvm/lib/IR/Core.cpp                          |    8 -
 llvm/lib/IR/DataLayout.cpp                    |    1 -
 llvm/lib/IR/Function.cpp                      |   14 +-
 llvm/lib/IR/Instructions.cpp                  |    9 -
 llvm/lib/IR/LLVMContextImpl.cpp               |    6 +-
 llvm/lib/IR/LLVMContextImpl.h                 |    2 +-
 llvm/lib/IR/Type.cpp                          |   12 -
 .../DirectX/DXILWriter/DXILBitcodeWriter.cpp  |    3 -
 .../Hexagon/HexagonTargetObjectFile.cpp       |    1 -
 llvm/lib/Target/X86/X86CallingConv.td         |   34 -
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   92 +-
 .../Target/X86/X86InstCombineIntrinsic.cpp    |    8 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h       |   96 +-
 .../IPO/DeadArgumentElimination.cpp           |    6 +-
 .../InstCombine/InstCombineCasts.cpp          |    7 -
 .../Instrumentation/MemorySanitizer.cpp       |   42 +-
 llvm/test/Assembler/x86mmx.ll                 |    9 -
 llvm/test/Bitcode/bcanalyzer-types.ll         |    6 -
 llvm/test/Bitcode/compatibility-3.6.ll        |    2 +-
 llvm/test/Bitcode/compatibility-3.7.ll        |    2 +-
 llvm/test/Bitcode/compatibility-3.8.ll        |    2 +-
 llvm/test/Bitcode/compatibility-3.9.ll        |    2 +-
 llvm/test/Bitcode/compatibility-4.0.ll        |    2 +-
 llvm/test/Bitcode/compatibility-5.0.ll        |    2 +-
 llvm/test/Bitcode/compatibility-6.0.ll        |    2 +-
 llvm/test/Bitcode/compatibility.ll            |    2 +-
 .../CodeGen/X86/2008-09-05-sinttofp-2xi32.ll  |   21 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll       |    8 +-
 llvm/test/CodeGen/X86/avx2-vbroadcast.ll      |   11 +-
 llvm/test/CodeGen/X86/fast-isel-bc.ll         |    9 +-
 .../CodeGen/X86/mmx-arg-passing-x86-64.ll     |   15 +-
 llvm/test/CodeGen/X86/mmx-arg-passing.ll      |   10 +-
 llvm/test/CodeGen/X86/mmx-arith.ll            |   73 +-
 llvm/test/CodeGen/X86/mmx-bitcast-fold.ll     |    2 +-
 llvm/test/CodeGen/X86/mmx-bitcast.ll          |    4 +-
 llvm/test/CodeGen/X86/mmx-cvt.ll              |   98 +-
 llvm/test/CodeGen/X86/mmx-fold-load.ll        |  107 +-
 llvm/test/CodeGen/X86/mmx-intrinsics.ll       |  398 ++--
 llvm/test/CodeGen/X86/pr23246.ll              |    2 +-
 llvm/test/CodeGen/X86/pr29222.ll              |    6 +-
 llvm/test/CodeGen/X86/select-mmx.ll           |   35 +-
 llvm/test/CodeGen/X86/stack-folding-mmx.ll    |  508 ++---
 llvm/test/CodeGen/X86/vec_extract-mmx.ll      |   11 +-
 llvm/test/CodeGen/X86/vec_insert-7.ll         |    3 +-
 llvm/test/CodeGen/X86/vec_insert-mmx.ll       |    8 +-
 .../MemorySanitizer/X86/mmx-intrinsics.ll     | 1667 +++++++++--------
 .../MemorySanitizer/vector_arith.ll           |   12 +-
 .../MemorySanitizer/vector_cvt.ll             |    6 +-
 .../MemorySanitizer/vector_pack.ll            |   15 +-
 .../MemorySanitizer/vector_shift.ll           |   10 +-
 .../Transforms/InstCombine/X86/x86-movmsk.ll  |   14 +-
 .../bitcast-vec-canon-inseltpoison.ll         |   45 -
 .../InstCombine/bitcast-vec-canon.ll          |   44 -
 .../InstSimplify/ConstProp/loads.ll           |   13 -
 llvm/test/Transforms/LoopUnroll/X86/mmx.ll    |   35 -
 .../Transforms/SLPVectorizer/X86/bad_types.ll |   62 -
 llvm/test/Transforms/SROA/pr57796.ll          |    6 +-
 llvm/tools/llvm-c-test/echo.cpp               |    2 -
 llvm/tools/llvm-stress/llvm-stress.cpp        |    8 +-
 llvm/unittests/IR/InstructionsTest.cpp        |    9 +-
 mlir/docs/Dialects/LLVM.md                    |    2 -
 mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h  |    1 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |    1 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp |    2 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp      |    9 +-
 mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp       |    2 -
 mlir/lib/Target/LLVMIR/TypeToLLVM.cpp         |    3 -
 mlir/test/Dialect/LLVMIR/types.mlir           |    2 -
 mlir/test/Target/LLVMIR/llvmir-types.mlir     |    2 -
 84 files changed, 1881 insertions(+), 1933 deletions(-)
 delete mode 100644 llvm/test/Assembler/x86mmx.ll
 delete mode 100644 llvm/test/Transforms/LoopUnroll/X86/mmx.ll

diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index c16530d3a70cb..b8a430adf6cf2 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -760,10 +760,6 @@ val void_type : llcontext -> lltype
     [llvm::Type::LabelTy]. *)
 val label_type : llcontext -> lltype
 
-(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the
-    context [c]. See [llvm::Type::X86_MMXTy]. *)
-val x86_mmx_type : llcontext -> lltype
-
 (** [type_by_name m name] returns the specified type from the current module
     if it exists.
     See the method [llvm::Module::getTypeByName] *)
diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 4ac824cd6a98a..5906f427e6907 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -686,11 +686,6 @@ value llvm_label_type(value Context) {
   return to_val(LLVMLabelTypeInContext(Context_val(Context)));
 }
 
-/* llcontext -> lltype */
-value llvm_x86_mmx_type(value Context) {
-  return to_val(LLVMX86MMXTypeInContext(Context_val(Context)));
-}
-
 /* llmodule -> string -> lltype option */
 value llvm_type_by_name(value M, value Name) {
   return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name)));
diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst
index 46af2e421a258..1a724a58f58e0 100644
--- a/llvm/docs/BitCodeFormat.rst
+++ b/llvm/docs/BitCodeFormat.rst
@@ -1291,7 +1291,7 @@ TYPE_CODE_X86_MMX Record
 
 ``[X86_MMX]``
 
-The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table.
+The ``X86_MMX`` record (code 17) is deprecated, and imported as a <1 x i64> vector.
 
 TYPE_CODE_STRUCT_ANON Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cd86156ec816f..474a86a02c2ed 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3945,24 +3945,6 @@ or constants of this type.
       x86_amx
 
 
-X86_mmx Type
-""""""""""""
-
-:Overview:
-
-The x86_mmx type represents a value held in an MMX register on an x86
-machine. The operations allowed on it are quite limited: parameters and
-return values, load and store, and bitcast. User-specified MMX
-instructions are represented as intrinsic or asm calls with arguments
-and/or results of this type. There are no arrays, vectors or constants
-of this type.
-
-:Syntax:
-
-::
-
-      x86_mmx
-
 
 .. _t_pointer:
 
@@ -4396,7 +4378,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit
 format is represented by ``0xR`` followed by 4 hexadecimal digits. All
 hexadecimal formats are big-endian (sign bit at the left).
 
-There are no constants of type x86_mmx and x86_amx.
+There are no constants of type x86_amx.
 
 .. _complexconstants:
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c6a4a74a220af..551a9bec3b916 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -50,6 +50,9 @@ Update on required toolchains to build LLVM
 Changes to the LLVM IR
 ----------------------
 
+* The ``x86_mmx`` IR type has been removed. It will be translated to
+  the standard vector type ``<1 x i64>`` in bitcode upgrade.
+
 Changes to LLVM infrastructure
 ------------------------------
 
@@ -120,6 +123,12 @@ Changes to the X86 Backend
   encoding. To use optimised NOP filling in a code section, leave off the
   "fillval" argument, i.e. `.balign N`, `.p2align N` or `.align N` respectively.
 
+* Due to the removal of the ``x86_mmx`` IR type, functions with
+  ``x86_mmx`` arguments or return values will use a different,
+  incompatible, calling convention ABI. Such functions are not
+  generally seen in the wild (Clang never generates them!), so this is
+  not expected to result in real-world compatibility problems.
+
 Changes to the OCaml bindings
 -----------------------------
 
@@ -129,6 +138,12 @@ Changes to the Python bindings
 Changes to the C API
 --------------------
 
+* The following symbols are deleted due to the removal of the ``x86_mmx`` IR type:
+
+  * ``LLVMX86_MMXTypeKind``
+  * ``LLVMX86MMXTypeInContext``
+  * ``LLVMX86MMXType``
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 223d8efe57daa..4bde82d816e35 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -146,27 +146,27 @@ typedef enum {
 } LLVMOpcode;
 
 typedef enum {
-  LLVMVoidTypeKind,      /**< type with no size */
-  LLVMHalfTypeKind,      /**< 16 bit floating point type */
-  LLVMFloatTypeKind,     /**< 32 bit floating point type */
-  LLVMDoubleTypeKind,    /**< 64 bit floating point type */
-  LLVMX86_FP80TypeKind,  /**< 80 bit floating point type (X87) */
-  LLVMFP128TypeKind,     /**< 128 bit floating point type (112-bit mantissa)*/
-  LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */
-  LLVMLabelTypeKind,     /**< Labels */
-  LLVMIntegerTypeKind,   /**< Arbitrary bit width integers */
-  LLVMFunctionTypeKind,  /**< Functions */
-  LLVMStructTypeKind,    /**< Structures */
-  LLVMArrayTypeKind,     /**< Arrays */
-  LLVMPointerTypeKind,   /**< Pointers */
-  LLVMVectorTypeKind,    /**< Fixed width SIMD vector type */
-  LLVMMetadataTypeKind,  /**< Metadata */
-  LLVMX86_MMXTypeKind,   /**< X86 MMX */
-  LLVMTokenTypeKind,     /**< Tokens */
-  LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */
-  LLVMBFloatTypeKind,    /**< 16 bit brain floating point type */
-  LLVMX86_AMXTypeKind,   /**< X86 AMX */
-  LLVMTargetExtTypeKind, /**< Target extension type */
+  LLVMVoidTypeKind = 0,     /**< type with no size */
+  LLVMHalfTypeKind = 1,     /**< 16 bit floating point type */
+  LLVMFloatTypeKind = 2,    /**< 32 bit floating point type */
+  LLVMDoubleTypeKind = 3,   /**< 64 bit floating point type */
+  LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */
+  LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/
+  LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */
+  LLVMLabelTypeKind = 7,     /**< Labels */
+  LLVMIntegerTypeKind = 8,   /**< Arbitrary bit width integers */
+  LLVMFunctionTypeKind = 9,  /**< Functions */
+  LLVMStructTypeKind = 10,   /**< Structures */
+  LLVMArrayTypeKind = 11,    /**< Arrays */
+  LLVMPointerTypeKind = 12,  /**< Pointers */
+  LLVMVectorTypeKind = 13,   /**< Fixed width SIMD vector type */
+  LLVMMetadataTypeKind = 14, /**< Metadata */
+                             /* 15 previously used by LLVMX86_MMXTypeKind */
+  LLVMTokenTypeKind = 16,    /**< Tokens */
+  LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */
+  LLVMBFloatTypeKind = 18,         /**< 16 bit brain floating point type */
+  LLVMX86_AMXTypeKind = 19,        /**< X86 AMX */
+  LLVMTargetExtTypeKind = 20,      /**< Target extension type */
 } LLVMTypeKind;
 
 typedef enum {
@@ -1716,11 +1716,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C);
  */
 LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C);
 
-/**
- * Create a X86 MMX type in a context.
- */
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C);
-
 /**
  * Create a X86 AMX type in a context.
  */
@@ -1742,7 +1737,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C);
  */
 LLVMTypeRef LLVMVoidType(void);
 LLVMTypeRef LLVMLabelType(void);
-LLVMTypeRef LLVMX86MMXType(void);
 LLVMTypeRef LLVMX86AMXType(void);
 
 /**
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index d14adfe1590be..5f7034b5ee36f 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -693,7 +693,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const {
   case Type::FloatTyID:
     return TypeSize::getFixed(32);
   case Type::DoubleTyID:
-  case Type::X86_MMXTyID:
     return TypeSize::getFixed(64);
   case Type::PPC_FP128TyID:
   case Type::FP128TyID:
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 1f0133c08e7d6..c74f9e9d24800 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -63,7 +63,6 @@ class Type {
     VoidTyID,      ///< type with no size
     LabelTyID,     ///< Labels
     MetadataTyID,  ///< Metadata
-    X86_MMXTyID,   ///< MMX vectors (64 bits, X86 specific)
     X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
     TokenTyID,     ///< Tokens
 
@@ -197,9 +196,6 @@ class Type {
 
   const fltSemantics &getFltSemantics() const;
 
-  /// Return true if this is X86 MMX.
-  bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; }
-
   /// Return true if this is X86 AMX.
   bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; }
 
@@ -285,8 +281,8 @@ class Type {
   /// Return true if the type is a valid type for a register in codegen. This
   /// includes all first-class types except struct and array types.
   bool isSingleValueType() const {
-    return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() ||
-           isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy();
+    return isFloatingPointTy() || isIntegerTy() || isPointerTy() ||
+           isVectorTy() || isX86_AMXTy() || isTargetExtTy();
   }
 
   /// Return true if the type is an aggregate type. This means it is valid as
@@ -302,8 +298,7 @@ class Type {
   bool isSized(SmallPtrSetImpl<Type*> *Visited = nullptr) const {
     // If it's a primitive, it is always sized.
     if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
-        getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID ||
-        getTypeID() == X86_AMXTyID)
+        getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID)
       return true;
     // If it is not something that can have a size (e.g. a function or label),
     // it doesn't have a size.
@@ -453,7 +448,6 @@ class Type {
   static Type *getX86_FP80Ty(LLVMContext &C);
   static Type *getFP128Ty(LLVMContext &C);
   static Type *getPPC_FP128Ty(LLVMContext &C);
-  static Type *getX86_MMXTy(LLVMContext &C);
   static Type *getX86_AMXTy(LLVMContext &C);
   static Type *getTokenTy(LLVMContext &C);
   static IntegerType *getIntNTy(LLVMContext &C, unsigned N);
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index df75745645e04..85ee231675d06 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -564,16 +564,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,
     Type *MapTy = Type::getIntNTy(C->getContext(),
                                   DL.getTypeSizeInBits(LoadTy).getFixedValue());
     if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
-      if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
-          !LoadTy->isX86_AMXTy())
+      if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
         // Materializing a zero can be done trivially without a bitcast
         return Constant::getNullValue(LoadTy);
       Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(LoadTy) : LoadTy;
       Res = FoldBitCast(Res, CastTy, DL);
       if (LoadTy->isPtrOrPtrVectorTy()) {
         // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr
-        if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
-            !LoadTy->isX86_AMXTy())
+        if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
           return Constant::getNullValue(LoadTy);
         if (DL.isNonIntegralPointerType(LoadTy->getScalarType()))
           // Be careful not to replace a load of an addrspace value with an inttoptr here
@@ -764,7 +762,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty,
   // uniform.
   if (!DL.typeSizeEqualsStoreSize(C->getType()))
     return nullptr;
-  if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy())
+  if (C->isNullValue() && !Ty->isX86_AMXTy())
     return Constant::getNullValue(Ty);
   if (C->isAllOnesValue() &&
       (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy()))
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 7d7fe19568e8a..c82e74972b67c 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -838,7 +838,8 @@ lltok::Kind LLLexer::LexIdentifier() {
   TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
   TYPEKEYWORD("label",     Type::getLabelTy(Context));
   TYPEKEYWORD("metadata",  Type::getMetadataTy(Context));
-  TYPEKEYWORD("x86_mmx",   Type::getX86_MMXTy(Context));
+  TYPEKEYWORD("x86_mmx", llvm::FixedVectorType::get(
+                             llvm::IntegerType::get(Context, 64), 1));
   TYPEKEYWORD("x86_amx",   Type::getX86_AMXTy(Context));
   TYPEKEYWORD("token",     Type::getTokenTy(Context));
   TYPEKEYWORD("ptr",       PointerType::getUnqual(Context));
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 84d624f6cf8fa..fd4ae109b4bb8 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2496,7 +2496,9 @@ Error BitcodeReader::parseTypeTableBody() {
       ResultTy = Type::getMetadataTy(Context);
       break;
     case bitc::TYPE_CODE_X86_MMX:   // X86_MMX
-      ResultTy = Type::getX86_MMXTy(Context);
+      // Deprecated: decodes as <1 x i64>
+      ResultTy =
+          llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
       break;
     case bitc::TYPE_CODE_X86_AMX:   // X86_AMX
       ResultTy = Type::getX86_AMXTy(Context);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 324dcbca8137e..52e15e6880ef2 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1088,8 +1088,9 @@ void ModuleBitcodeWriter::writeTypeTable() {
     case Type::FP128TyID:     Code = bitc::TYPE_CODE_FP128;     break;
     case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
     case Type::LabelTyID:     Code = bitc::TYPE_CODE_LABEL;     break;
-    case Type::MetadataTyID:  Code = bitc::TYPE_CODE_METADATA;  break;
-    case Type::X86_MMXTyID:   Code = bitc::TYPE_CODE_X86_MMX;   break;
+    case Type::MetadataTyID:
+      Code = bitc::TYPE_CODE_METADATA;
+      break;
     case Type::X86_AMXTyID:   Code = bitc::TYPE_CODE_X86_AMX;   break;
     case Type::TokenTyID:     Code = bitc::TYPE_CODE_TOKEN;     break;
     case Type::IntegerTyID:
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index b0f736a49c20e..0c6b726a28a24 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -207,7 +207,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
     assert(isExtended() && "Type is not extended!");
     return LLVMTy;
   case MVT::isVoid:  return Type::getVoidTy(Context);
-  case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
+  case MVT::x86mmx:  return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
   case MVT::aarch64svcount:
     return TargetExtType::get(Context, "aarch64.svcount");
   case MVT::x86amx:  return Type::getX86_AMXTy(Context);
@@ -241,8 +241,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
   case Type::BFloatTyID:    return MVT(MVT::bf16);
   case Type::FloatTyID:     return MVT(MVT::f32);
   case Type::DoubleTyID:    return MVT(MVT::f64);
-  case Type::X86_FP80TyID:  return MVT(MVT::f80);
-  case Type::X86_MMXTyID:   return MVT(MVT::x86mmx);
+  case Type::X86_FP80TyID:
+    return MVT(MVT::f80);
   case Type::TargetExtTyID: {
     TargetExtType *TargetExtTy = cast<TargetExtType>(Ty);
     if (TargetExtTy->getName() == "aarch64.svcount")
@@ -302,4 +302,3 @@ void MVT::print(raw_ostream &OS) const {
   else
     OS << EVT(*this).getEVTString();
 }
-
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 6599730590de6..01a16ccd688f4 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -573,8 +573,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
   case Type::FP128TyID:     OS << "fp128"; return;
   case Type::PPC_FP128TyID: OS << "ppc_fp128"; return;
   case Type::LabelTyID:     OS << "label"; return;
-  case Type::MetadataTyID:  OS << "metadata"; return;
-  case Type::X86_MMXTyID:   OS << "x86_mmx"; return;
+  case Type::MetadataTyID:
+    OS << "metadata";
+    return;
   case Type::X86_AMXTyID:   OS << "x86_amx"; return;
   case Type::TokenTyID:     OS << "token"; return;
   case Type::IntegerTyID:
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 693674ae0d06f..05ab0968ef6f3 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
     return UndefValue::get(DestTy);
   }
 
-  if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
+  if (V->isNullValue() && !DestTy->isX86_AMXTy() &&
       opc != Instruction::AddrSpaceCast)
     return Constant::getNullValue(DestTy);
 
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index cf7bbf6b2576f..17c0bf72ef05d 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -609,8 +609,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
     return LLVMPointerTypeKind;
   case Type::FixedVectorTyID:
     return LLVMVectorTypeKind;
-  case Type::X86_MMXTyID:
-    return LLVMX86_MMXTypeKind;
   case Type::X86_AMXTyID:
     return LLVMX86_AMXTypeKind;
   case Type::TokenTyID:
@@ -725,9 +723,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) {
 LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) {
   return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C));
 }
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
-  return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
-}
 LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) {
   return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C));
 }
@@ -753,9 +748,6 @@ LLVMTypeRef LLVMFP128Type(void) {
 LLVMTypeRef LLVMPPCFP128Type(void) {
   return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext());
 }
-LLVMTypeRef LLVMX86MMXType(void) {
-  return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
-}
 LLVMTypeRef LLVMX86AMXType(void) {
   return LLVMX86AMXTypeInContext(LLVMGetGlobalContext());
 }
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 2741165332487..17897f77b4edb 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -835,7 +835,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
     // layout.
     return Align(PowerOf2Ceil(BitWidth / 8));
   }
-  case Type::X86_MMXTyID:
   case Type::FixedVectorTyID:
   case Type::ScalableVectorTyID: {
     unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 20871982afb06..9b0dd5fca7e0e 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1052,8 +1052,9 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
     case Type::DoubleTyID:    Result += "f64";      break;
     case Type::X86_FP80TyID:  Result += "f80";      break;
     case Type::FP128TyID:     Result += "f128";     break;
-    case Type::PPC_FP128TyID: Result += "ppcf128";  break;
-    case Type::X86_MMXTyID:   Result += "x86mmx";   break;
+    case Type::PPC_FP128TyID:
+      Result += "ppcf128";
+      break;
     case Type::X86_AMXTyID:   Result += "x86amx";   break;
     case Type::IntegerTyID:
       Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
@@ -1397,7 +1398,8 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   switch (D.Kind) {
   case IITDescriptor::Void: return Type::getVoidTy(Context);
   case IITDescriptor::VarArg: return Type::getVoidTy(Context);
-  case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
+  case IITDescriptor::MMX:
+    return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
   case IITDescriptor::AMX: return Type::getX86_AMXTy(Context);
   case IITDescriptor::Token: return Type::getTokenTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
@@ -1580,7 +1582,11 @@ static bool matchIntrinsicType(
   switch (D.Kind) {
     case IITDescriptor::Void: return !Ty->isVoidTy();
     case IITDescriptor::VarArg: return true;
-    case IITDescriptor::MMX:  return !Ty->isX86_MMXTy();
+    case IITDescriptor::MMX: {
+      FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+      return !VT || VT->getNumElements() != 1 ||
+             !VT->getElementType()->isIntegerTy(64);
+    }
     case IITDescriptor::AMX:  return !Ty->isX86_AMXTy();
     case IITDescriptor::Token: return !Ty->isTokenTy();
     case IITDescriptor::Metadata: return !Ty->isMetadataTy();
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 7a8cf8c230498..58ebe7e95cd06 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3116,9 +3116,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) {
   if (SrcBits != DestBits)
     return false;
 
-  if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy())
-    return false;
-
   return true;
 }
 
@@ -3228,12 +3225,6 @@ CastInst::getCastOpcode(
       return IntToPtr;                              // int -> ptr
     }
     llvm_unreachable("Casting pointer to other than pointer or int");
-  } else if (DestTy->isX86_MMXTy()) {
-    if (SrcTy->isVectorTy()) {
-      assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
-      return BitCast;                               // 64-bit vector to MMX
-    }
-    llvm_unreachable("Illegal cast to X86_MMX");
   }
   llvm_unreachable("Casting to type that is not first-class");
 }
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 0a376179d609c..4f1ef8cec3213 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -40,9 +40,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
       FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID),
       MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID),
       X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID),
-      PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID),
-      X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8),
-      Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {}
+      PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID),
+      Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32),
+      Int64Ty(C, 64), Int128Ty(C, 128) {}
 
 LLVMContextImpl::~LLVMContextImpl() {
 #ifndef NDEBUG
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 937a87d686175..8e9ca21d149f6 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1582,7 +1582,7 @@ class LLVMContextImpl {
   // Basic type instances.
   Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy,
       TokenTy;
-  Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy;
+  Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy;
   IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
 
   std::unique_ptr<ConstantTokenNone> TheNoneToken;
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 5c61ad9f000b0..9ddccce7f959c 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -45,7 +45,6 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
   case PPC_FP128TyID : return getPPC_FP128Ty(C);
   case LabelTyID     : return getLabelTy(C);
   case MetadataTyID  : return getMetadataTy(C);
-  case X86_MMXTyID   : return getX86_MMXTy(C);
   case X86_AMXTyID   : return getX86_AMXTy(C);
   case TokenTyID     : return getTokenTy(C);
   default:
@@ -125,14 +124,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const {
   if (isa<VectorType>(this) && isa<VectorType>(Ty))
     return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits();
 
-  //  64-bit fixed width vector types can be losslessly converted to x86mmx.
-  if (((isa<FixedVectorType>(this)) && Ty->isX86_MMXTy()) &&
-      getPrimitiveSizeInBits().getFixedValue() == 64)
-    return true;
-  if ((isX86_MMXTy() && isa<FixedVectorType>(Ty)) &&
-      Ty->getPrimitiveSizeInBits().getFixedValue() == 64)
-    return true;
-
   //  8192-bit fixed width vector types can be losslessly converted to x86amx.
   if (((isa<FixedVectorType>(this)) && Ty->isX86_AMXTy()) &&
       getPrimitiveSizeInBits().getFixedValue() == 8192)
@@ -179,8 +170,6 @@ TypeSize Type::getPrimitiveSizeInBits() const {
     return TypeSize::getFixed(128);
   case Type::PPC_FP128TyID:
     return TypeSize::getFixed(128);
-  case Type::X86_MMXTyID:
-    return TypeSize::getFixed(64);
   case Type::X86_AMXTyID:
     return TypeSize::getFixed(8192);
   case Type::IntegerTyID:
@@ -245,7 +234,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; }
 Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; }
 Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; }
 Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; }
-Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; }
 Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; }
 
 IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; }
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 3433408f05171..cd0d6d34e9a67 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() {
     case Type::MetadataTyID:
       Code = bitc::TYPE_CODE_METADATA;
       break;
-    case Type::X86_MMXTyID:
-      Code = bitc::TYPE_CODE_X86_MMX;
-      break;
     case Type::IntegerTyID:
       // INTEGER: [width]
       Code = bitc::TYPE_CODE_INTEGER;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e5d10a75728bf..0c1b0aea41f41 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
   case Type::PPC_FP128TyID:
   case Type::LabelTyID:
   case Type::MetadataTyID:
-  case Type::X86_MMXTyID:
   case Type::X86_AMXTyID:
   case Type::TokenTyID:
   case Type::TypedPointerTyID:
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 9ec68bfb8e0f7..c55ff3dfc9c8e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[
     CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
     CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
 
-    // MMX type gets 8 byte slot in stack , while alignment depends on target
-    CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
-    CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
     // float 128 get stack slots whose size and alignment depends 
     // on the subtarget.
     CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
@@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[
   CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
             CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
 
-  // MMX vector types are always returned in MM0. If the target doesn't have
-  // MM0, it doesn't support these vector types.
-  CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
-
   // Long double types are always returned in FP0 (even with SSE),
   // except on Win64.
   CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
@@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
   CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
 
-  // MMX vector types are always returned in XMM0.
-  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
-
   // Pointers are always returned in full 64-bit registers.
   CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
 
@@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[
 
 // X86-Win64 C return-value convention.
 def RetCC_X86_Win64_C : CallingConv<[
-  // The X86-Win64 calling convention always returns __m64 values in RAX.
-  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
   // GCC returns FP values in RAX on Win64.
   CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
   CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
@@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
   CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
-  // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
-  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
@@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[
 
   CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
 
-  // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
-  CCIfType<[x86mmx],
-            CCIfSubtarget<"isTargetDarwin()",
-            CCIfSubtarget<"hasSSE2()",
-            CCPromoteToType<v2i64>>>>,
-
   // Boolean vectors of AVX-512 are passed in SIMD registers.
   // The call from AVX to AVX-512 function should work,
   // since the boolean types in AVX/AVX2 are promoted by default.
@@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[
   // Long doubles are passed by pointer
   CCIfType<[f80], CCPassIndirect<i64>>,
 
-  // The first 4 MMX vector arguments are passed in GPRs.
-  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
   // If SSE was disabled, pass FP values smaller than 64-bits as integers in
   // GPRs or on the stack.
   CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
@@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[
 
   CCIfNotVarArg<CCIfInReg<CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
 
-  // The first 3 __m64 vector arguments are passed in mmx registers if the
-  // call is not a vararg call.
-  CCIfNotVarArg<CCIfType<[x86mmx],
-                CCAssignToReg<[MM0, MM1, MM2]>>>,
-
   CCIfType<[f16], CCAssignToStack<4, 4>>,
 
   // Integer/Float values get stored in stack slots that are 4 bytes in
@@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[
   CCIfType<[v32i1], CCPromoteToType<v32i8>>,
   CCIfType<[v64i1], CCPromoteToType<v64i8>>,
 
-  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
-  // passed in the parameter area.
-  CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
   // Darwin passes vectors in a form that differs from the i386 psABI
   CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
 
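
With the dedicated x86mmx rules removed, <1 x i64> arguments and return values fall through to the generic handling in these calling conventions; as the updated tests later in this patch show, on x86-64 such an argument now arrives in a general-purpose register. A minimal sketch (hypothetical function name, behavior inferred from the test updates below):

  define void @use(<1 x i64> %v) {
    ; on x86-64 SysV, %v is received in %rdi after this change
    ret void
  }
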
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f2c98db83304..ad59b13933a6a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2554,7 +2554,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FP_EXTEND,
                        ISD::STRICT_FP_EXTEND,
                        ISD::FP_ROUND,
-                       ISD::STRICT_FP_ROUND});
+                       ISD::STRICT_FP_ROUND,
+                       ISD::INTRINSIC_VOID,
+                       ISD::INTRINSIC_WO_CHAIN,
+                       ISD::INTRINSIC_W_CHAIN});
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -27269,6 +27272,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       llvm_unreachable("Unsupported truncstore intrinsic");
     }
   }
+  case INTR_TYPE_CAST_MMX:
+    return SDValue(); // handled in combineINTRINSIC_*
   }
 }
 
@@ -57761,6 +57766,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
+// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
+// use x86mmx instead.
+static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
+  SDLoc dl(N);
+
+  bool MadeChange = false, CastReturnVal = false;
+  SmallVector<SDValue, 8> Args;
+  for (const SDValue &Arg : N->op_values()) {
+    if (Arg.getValueType() == MVT::v1i64) {
+      MadeChange = true;
+      Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
+    } else
+      Args.push_back(Arg);
+  }
+  SDVTList VTs = N->getVTList();
+  SDVTList NewVTs = VTs;
+  if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
+    SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
+    NewVTArr[0] = MVT::x86mmx;
+    NewVTs = DAG.getVTList(NewVTArr);
+    MadeChange = true;
+    CastReturnVal = true;
+  }
+
+  if (MadeChange) {
+    SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
+    if (CastReturnVal) {
+      SmallVector<SDValue, 2> Returns;
+      for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
+        Returns.push_back(Result.getValue(i));
+      Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
+      return DAG.getMergeValues(Returns, dl);
+    }
+    return Result;
+  }
+  return SDValue();
+}
+static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(0);
+  const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
+static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(1);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
+static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned IntNo = N->getConstantOperandVal(1);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+  if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+    return FixupMMXIntrinsicTypes(N, DAG);
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -57951,7 +58036,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
-  // clang-format on
+  case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
+  case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
+  case ISD::INTRINSIC_VOID:  return combineINTRINSIC_VOID(N, DAG, DCI);
+    // clang-format on
   }
 
   return SDValue();
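
These three combines only run before legalization: SelectionDAGBuilder emits INTR_TYPE_CAST_MMX intrinsics with v1i64 operand and result types (matching their <1 x i64> IR signatures), and FixupMMXIntrinsicTypes rebuilds each node with x86mmx types, inserting bitcasts on the operands and on the first result so the existing MMX selection patterns still match. A hedged IR-level sketch, assuming the in-tree llvm.x86.sse.pshuf.w intrinsic:

  %r = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 5)

is built as an INTRINSIC_WO_CHAIN node over v1i64 and is rewritten, before legalization, into the equivalent node over x86mmx wrapped in v1i64 <-> x86mmx bitcasts.
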
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 322cb6f6f5819..793d62ba2a8e7 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
   if (isa<UndefValue>(Arg))
     return Constant::getNullValue(ResTy);
 
-  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
-  // We can't easily peek through x86_mmx types.
-  if (!ArgTy)
+  // Preserve previous behavior and give up.
+  // TODO: treat as <8 x i8>.
+  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
     return nullptr;
 
+  auto *ArgTy = cast<FixedVectorType>(Arg->getType());
+
   // Expand MOVMSK to compare/bitcast/zext:
   // e.g. PMOVMSKB(v16i8 x):
   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 717541cf6c559..685daca360e08 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -73,7 +73,8 @@ enum IntrinsicType : uint16_t {
   GATHER_AVX2,
   ROUNDP,
   ROUNDS,
-  RDPRU
+  RDPRU,
+  INTR_TYPE_CAST_MMX
 };
 
 struct IntrinsicData {
@@ -323,6 +324,8 @@ static const IntrinsicData IntrinsicsWithChain[] = {
     X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
     X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
     X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+    X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
     X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0),
     X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
@@ -1495,6 +1498,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
                        0),
+
+    X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pavg_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0),
+
     X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
     X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
     X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
@@ -1503,8 +1575,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
     X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
     X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+    X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
     X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+    X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
     X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
     X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
@@ -1512,6 +1590,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
     X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
     X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+    X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
     X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
     X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
@@ -1593,14 +1672,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
     X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
     X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+    X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW,
                        0),
+    X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+    X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
     X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
     X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index a164c82bdf75d..f5a7ab26a49e9 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -962,8 +962,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
       } else if (NewCB->getType()->isVoidTy()) {
         // If the return value is dead, replace any uses of it with poison
         // (any non-debug value uses will get removed later on).
-        if (!CB.getType()->isX86_MMXTy())
-          CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
+        CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
       } else {
         assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
                "Return type changed, but not into a void. The old return type"
@@ -1027,8 +1026,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
     } else {
       // If this argument is dead, replace any uses of it with poison
       // (any non-debug value uses will get removed later on).
-      if (!I->getType()->isX86_MMXTy())
-        I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+      I->replaceAllUsesWith(PoisonValue::get(I->getType()));
     }
 
   // If we change the return value of the function we must rewrite any return
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 8f83047020936..ed78201101ab3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2679,13 +2679,6 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
     return replaceInstUsesWith(CI, Src);
 
   if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
-    // Beware: messing with this target-specific oddity may cause trouble.
-    if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
-      Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
-      return InsertElementInst::Create(PoisonValue::get(DestTy), Elem,
-                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
-    }
-
     if (isa<IntegerType>(SrcTy)) {
       // If this is a cast from an integer to vector, check to see if the input
       // is a trunc or zext of a bitcast from vector.  If so, we can replace all
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c979e81ac1a3f..910c36faf7e0f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2987,8 +2987,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// Caller guarantees that this intrinsic does not access memory.
   bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
     Type *RetTy = I.getType();
-    if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() ||
-          RetTy->isX86_MMXTy()))
+    if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy()))
       return false;
 
     unsigned NumArgOperands = I.arg_size();
@@ -3218,7 +3217,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
-  // Get an X86_MMX-sized vector type.
+  // Get an MMX-sized vector type.
   Type *getMMXVectorTy(unsigned EltSizeInBits) {
     const unsigned X86_MMXSizeInBits = 64;
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
@@ -3264,20 +3263,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   // packs elements of 2 input vectors into half as many bits with saturation.
   // Shadow is propagated with the signed variant of the same intrinsic applied
   // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
-  // EltSizeInBits is used only for x86mmx arguments.
-  void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+  // MMXEltSizeInBits is used only for x86mmx arguments.
+  void handleVectorPackIntrinsic(IntrinsicInst &I,
+                                 unsigned MMXEltSizeInBits = 0) {
     assert(I.arg_size() == 2);
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
     IRBuilder<> IRB(&I);
     Value *S1 = getShadow(&I, 0);
     Value *S2 = getShadow(&I, 1);
-    assert(isX86_MMX || S1->getType()->isVectorTy());
+    assert(S1->getType()->isVectorTy());
 
     // SExt and ICmpNE below must apply to individual elements of input vectors.
     // In case of x86mmx arguments, cast them to appropriate vector types and
     // back.
-    Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
-    if (isX86_MMX) {
+    Type *T =
+        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType();
+    if (MMXEltSizeInBits) {
       S1 = IRB.CreateBitCast(S1, T);
       S2 = IRB.CreateBitCast(S2, T);
     }
@@ -3285,10 +3285,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
     Value *S2_ext =
         IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
-    if (isX86_MMX) {
-      Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
-      S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
-      S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+    if (MMXEltSizeInBits) {
+      S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64));
+      S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64));
     }
 
     Function *ShadowFn = Intrinsic::getDeclaration(
@@ -3296,7 +3295,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     Value *S =
         IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
-    if (isX86_MMX)
+    if (MMXEltSizeInBits)
       S = IRB.CreateBitCast(S, getShadowTy(&I));
     setShadow(&I, S);
     setOriginForNaryOp(I);
@@ -3403,10 +3402,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   // Instrument sum-of-absolute-differences intrinsic.
-  void handleVectorSadIntrinsic(IntrinsicInst &I) {
+  void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) {
     const unsigned SignificantBitsPerResultElement = 16;
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
-    Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+    Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType();
     unsigned ZeroBitsPerResultElement =
         ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
 
@@ -3425,9 +3423,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   // Instrument multiply-add intrinsic.
   void handleVectorPmaddIntrinsic(IntrinsicInst &I,
-                                  unsigned EltSizeInBits = 0) {
-    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
-    Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+                                  unsigned MMXEltSizeInBits = 0) {
+    Type *ResTy =
+        MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
     IRBuilder<> IRB(&I);
     auto *Shadow0 = getShadow(&I, 0);
     auto *Shadow1 = getShadow(&I, 1);
@@ -4161,6 +4159,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
 
     case Intrinsic::x86_mmx_psad_bw:
+      handleVectorSadIntrinsic(I, true);
+      break;
     case Intrinsic::x86_sse2_psad_bw:
     case Intrinsic::x86_avx2_psad_bw:
       handleVectorSadIntrinsic(I);
@@ -5048,7 +5048,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase {
     Type *T = arg->getType();
     if (T->isX86_FP80Ty())
       return AK_Memory;
-    if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+    if (T->isFPOrFPVectorTy())
       return AK_FloatingPoint;
     if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
       return AK_GeneralPurpose;
diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll
deleted file mode 100644
index 608347e0fceb1..0000000000000
--- a/llvm/test/Assembler/x86mmx.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | FileCheck %s
-; RUN: verify-uselistorder %s
-; Basic smoke test for x86_mmx type.
-
-; CHECK: define x86_mmx @sh16
-define x86_mmx  @sh16(x86_mmx %A) {
-; CHECK: ret x86_mmx %A
-        ret x86_mmx %A
-}
diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll
index cbe6f5d22c947..f1732db174c29 100644
--- a/llvm/test/Bitcode/bcanalyzer-types.ll
+++ b/llvm/test/Bitcode/bcanalyzer-types.ll
@@ -3,7 +3,6 @@
 ; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID)
 ; CHECK: BFLOAT
 ; CHECK: TOKEN
-; CHECK: X86_MMX
 ; CHECK: HALF
 ; CHECK: Block ID
 
@@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) {
   ret half %a
 }
 
-define x86_mmx @test_mmx(<2 x i32> %x) {
-  %a = bitcast <2 x i32> %x to x86_mmx
-  ret x86_mmx %a
-}
-
 define bfloat @test_bfloat(i16 %x) {
   %a = bitcast i16 %x to bfloat
   ret bfloat %a
diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll
index 2190e2fbccf28..37a87eea41ad3 100644
--- a/llvm/test/Bitcode/compatibility-3.6.ll
+++ b/llvm/test/Bitcode/compatibility-3.6.ll
@@ -645,7 +645,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll
index 7e59b5c1be6e2..8de2132d7ec89 100644
--- a/llvm/test/Bitcode/compatibility-3.7.ll
+++ b/llvm/test/Bitcode/compatibility-3.7.ll
@@ -689,7 +689,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll
index ebd1f2fff8c94..7f766aa34a005 100644
--- a/llvm/test/Bitcode/compatibility-3.8.ll
+++ b/llvm/test/Bitcode/compatibility-3.8.ll
@@ -742,7 +742,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll
index c34f04ceb0de3..c8309175e063f 100644
--- a/llvm/test/Bitcode/compatibility-3.9.ll
+++ b/llvm/test/Bitcode/compatibility-3.9.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll
index 05bffda1d117a..adbd91ac6c7fe 100644
--- a/llvm/test/Bitcode/compatibility-4.0.ll
+++ b/llvm/test/Bitcode/compatibility-4.0.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll
index 0c872289c62ba..1b500da69568a 100644
--- a/llvm/test/Bitcode/compatibility-5.0.ll
+++ b/llvm/test/Bitcode/compatibility-5.0.ll
@@ -820,7 +820,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll
index 44c680885be34..c1abbf0cda6eb 100644
--- a/llvm/test/Bitcode/compatibility-6.0.ll
+++ b/llvm/test/Bitcode/compatibility-6.0.ll
@@ -830,7 +830,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca %opaquety*
   ; CHECK: %t8 = alloca ptr
 
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index e437c37d8d1c8..a7567038b7a7b 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1113,7 +1113,7 @@ define void @typesystem() {
   %t6 = alloca ppc_fp128
   ; CHECK: %t6 = alloca ppc_fp128
   %t7 = alloca x86_mmx
-  ; CHECK: %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca <1 x i64>
   %t8 = alloca ptr
   ; CHECK: %t8 = alloca ptr
   %t9 = alloca <4 x i32>
diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index ac86279ca6667..3a112ae2a2113 100644
--- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -29,7 +29,17 @@ entry:
 define <2 x double> @a2(x86_mmx %x) nounwind {
 ; CHECK-LABEL: a2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    cvtpi2pd %mm0, %xmm0
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    cvtpi2pd (%esp), %xmm0
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 entry:
   %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
@@ -39,7 +49,16 @@ entry:
 define x86_mmx @b2(<2 x double> %x) nounwind {
 ; CHECK-LABEL: b2:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    cvttpd2pi %xmm0, %mm0
+; CHECK-NEXT:    movq %mm0, (%esp)
+; CHECK-NEXT:    movl (%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 entry:
   %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index b442a6337e3b8..3f6f8c01b9049 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1014,17 +1014,13 @@ define float @broadcast_lifetime() nounwind {
 define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
 ; X86-LABEL: broadcast_x86_mmx:
 ; X86:       ## %bb.0: ## %bb
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_x86_mmx:
 ; X64:       ## %bb.0: ## %bb
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT:    vmovq %rdi, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X64-NEXT:    retq
 bb:
   %tmp1 = bitcast x86_mmx %tmp to i64
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index b7516d30df5f6..fed6c2eb8ba0a 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1452,25 +1452,18 @@ eintry:
 define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
 ; X86-LABEL: broadcast_x86_mmx:
 ; X86:       ## %bb.0: ## %bb
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: broadcast_x86_mmx:
 ; X64-AVX2:       ## %bb.0: ## %bb
-; X64-AVX2-NEXT:    movdq2q %xmm0, %mm0
-; X64-AVX2-NEXT:    movq %mm0, %rax
-; X64-AVX2-NEXT:    vmovq %rax, %xmm0
+; X64-AVX2-NEXT:    vmovq %rdi, %xmm0
 ; X64-AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: broadcast_x86_mmx:
 ; X64-AVX512VL:       ## %bb.0: ## %bb
-; X64-AVX512VL-NEXT:    movdq2q %xmm0, %mm0
-; X64-AVX512VL-NEXT:    movq %mm0, %rax
-; X64-AVX512VL-NEXT:    vpbroadcastq %rax, %xmm0
+; X64-AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm0
 ; X64-AVX512VL-NEXT:    retq
 bb:
   %tmp1 = bitcast x86_mmx %tmp to i64
diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll
index 0fbc9fab05681..e3bb5e7176e57 100644
--- a/llvm/test/CodeGen/X86/fast-isel-bc.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll
@@ -12,7 +12,11 @@ define void @func1() nounwind {
 ; X86-LABEL: func1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000
+; X86-NEXT:    movl $2, %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %esp, %eax
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    calll _func2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -20,8 +24,7 @@ define void @func1() nounwind {
 ; X64-LABEL: func1:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movabsq $8589934592, %rdi ## imm = 0x200000000
 ; X64-NEXT:    callq _func2
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index a4dbb10e0d7a0..54f048eb697f6 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -10,8 +10,8 @@ define void @t3() nounwind  {
 ; X86-64-LABEL: t3:
 ; X86-64:       ## %bb.0:
 ; X86-64-NEXT:    movq _g_v8qi at GOTPCREL(%rip), %rax
-; X86-64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    movq (%rax), %rdi
+; X86-64-NEXT:    xorl %eax, %eax
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %tmp3 = load <8 x i8>, ptr @g_v8qi, align 8
   %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
@@ -22,12 +22,11 @@ define void @t3() nounwind  {
 define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
 ; X86-64-LABEL: t4:
 ; X86-64:       ## %bb.0:
-; X86-64-NEXT:    movdq2q %xmm1, %mm0
-; X86-64-NEXT:    movdq2q %xmm0, %mm1
-; X86-64-NEXT:    movq2dq %mm1, %xmm1
-; X86-64-NEXT:    movq2dq %mm0, %xmm0
-; X86-64-NEXT:    paddb %xmm1, %xmm0
-; X86-64-NEXT:    movb $1, %al
+; X86-64-NEXT:    movq %rdi, %xmm0
+; X86-64-NEXT:    movq %rsi, %xmm1
+; X86-64-NEXT:    paddb %xmm0, %xmm1
+; X86-64-NEXT:    movq %xmm1, %rdi
+; X86-64-NEXT:    xorl %eax, %eax
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %v1a = bitcast x86_mmx %v1 to <8 x i8>
   %v2b = bitcast x86_mmx %v2 to <8 x i8>
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
index af116a2ac281b..1ae9920873faf 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
@@ -13,15 +13,17 @@
 define void @t1(x86_mmx %v1) nounwind  {
 ; X86-32-LABEL: t1:
 ; X86-32:       ## %bb.0:
-; X86-32-NEXT:    movl L_u1$non_lazy_ptr, %eax
-; X86-32-NEXT:    movq %mm0, (%eax)
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT:    movl L_u1$non_lazy_ptr, %edx
+; X86-32-NEXT:    movl %ecx, 4(%edx)
+; X86-32-NEXT:    movl %eax, (%edx)
 ; X86-32-NEXT:    retl
 ;
 ; X86-64-LABEL: t1:
 ; X86-64:       ## %bb.0:
-; X86-64-NEXT:    movdq2q %xmm0, %mm0
 ; X86-64-NEXT:    movq _u1 at GOTPCREL(%rip), %rax
-; X86-64-NEXT:    movq %mm0, (%rax)
+; X86-64-NEXT:    movq %rdi, (%rax)
 ; X86-64-NEXT:    retq
 	store x86_mmx %v1, ptr @u1, align 8
 	ret void
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 68287a4feee47..230e763a7c734 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    paddsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusb (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubb %xmm1, %xmm0
 ; X86-NEXT:    movdq2q %xmm0, %mm0
@@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    psubsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusb (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    paddsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubb %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    psubsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -239,8 +239,13 @@ entry:
 define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    paddw %xmm0, %xmm1
@@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    paddsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusw (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubw %xmm1, %xmm0
 ; X86-NEXT:    movdq2q %xmm0, %mm0
@@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    psubsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusw (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    pmullw %xmm0, %xmm1
 ; X86-NEXT:    movdq2q %xmm1, %mm0
@@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    pmulhw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    pmaddwd (%ecx), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movq %mm0, (%eax)
-; X86-NEXT:    movq2dq %mm0, %xmm0
-; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    andps %xmm0, %xmm1
-; X86-NEXT:    movlps %xmm1, (%eax)
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    orps %xmm1, %xmm0
-; X86-NEXT:    movlps %xmm0, (%eax)
-; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    xorps %xmm0, %xmm1
-; X86-NEXT:    movlps %xmm1, (%eax)
+; X86-NEXT:    andl 4(%ecx), %esi
+; X86-NEXT:    movd %esi, %xmm0
+; X86-NEXT:    andl (%ecx), %edx
+; X86-NEXT:    movd %edx, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    movq %xmm1, (%eax)
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    pxor %xmm0, %xmm1
+; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    emms
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
@@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    paddsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubw %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    psubsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    pmullw %xmm0, %xmm1
 ; X64-NEXT:    movdq2q %xmm1, %mm0
@@ -309,17 +322,17 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    pmulhw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    pmaddwd (%rsi), %mm0
+; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    movq %mm0, (%rdi)
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    andps %xmm0, %xmm1
-; X64-NEXT:    movlps %xmm1, (%rdi)
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    orps %xmm1, %xmm0
-; X64-NEXT:    movlps %xmm0, (%rdi)
-; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    xorps %xmm0, %xmm1
-; X64-NEXT:    movlps %xmm1, (%rdi)
+; X64-NEXT:    andq (%rsi), %rax
+; X64-NEXT:    movq %rax, %xmm0
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    por %xmm0, %xmm1
+; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
index 0fa7b24ff445a..a1240911cd36a 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s
 
-; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
+; CHECK: @foo(<1 x i64> zeroinitializer)
 
 define void @bar() {
 entry:
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll
index f914b8622fcf4..49c2027f06604 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll
@@ -58,8 +58,8 @@ define i64 @t3(ptr %p) {
 define void @t4(<1 x i64> %A, <1 x i64> %B) {
 ; CHECK-LABEL: t4:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rdi, %mm0
-; CHECK-NEXT:    movq %rsi, %mm1
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
 ; CHECK-NEXT:    paddusw %mm0, %mm1
 ; CHECK-NEXT:    movq _R at GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movq %mm1, (%rax)
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index c09c417c11c96..11473f3f6c236 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -8,20 +8,10 @@
 define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvt_v2f64_v2i32:
@@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvtt_v2f64_v2i32:
@@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v2f64_v2i32:
@@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvt_v2f32_v2i32:
@@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cvtt_v2f32_v2i32:
@@ -186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v4f32_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v4f32_v4i32:
@@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fptosi_v2f32_v2i32:
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 73df6be8d7989..a31339902bb64 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -288,8 +288,13 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -301,7 +306,8 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt0:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddb (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -321,8 +327,13 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -334,7 +345,8 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -353,8 +365,13 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -366,7 +383,8 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddd (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddd (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -385,8 +403,13 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -398,7 +421,8 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddq (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddq (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -417,8 +441,13 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -430,7 +459,8 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt4:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddusb (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddusb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -449,8 +479,13 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -462,7 +497,8 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt5:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    paddusw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    paddusw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -481,8 +517,13 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -494,7 +535,8 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt6:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrlw (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrlw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -513,8 +555,13 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrld (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -526,7 +573,8 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt7:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrld (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrld (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -545,8 +593,13 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -558,7 +611,8 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
 ;
 ; X64-LABEL: tt8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    psrlq (%rdi), %mm0
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    psrlq (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
@@ -612,16 +666,29 @@ declare void @llvm.lifetime.end(i64, ptr nocapture)
 define x86_mmx @vec_load(ptr %x) {
 ; X86-LABEL: vec_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
 ; X86-NEXT:    paddsb %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vec_load:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
 ; X64-NEXT:    paddsb %mm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
   %z = load <4 x float>, ptr %x
   %y = extractelement <4 x float> %z, i32 0
diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
index a43d9400cde6c..69fc636107544 100644
--- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
@@ -32,10 +32,10 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -77,10 +77,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test88:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -122,10 +122,10 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test87:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -167,10 +167,10 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test86:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpgtb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpgtb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -212,10 +212,10 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test85:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -257,10 +257,10 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test84:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -302,10 +302,10 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test83:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pcmpeqb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pcmpeqb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -347,10 +347,10 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test82:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -392,10 +392,10 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test81:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -437,10 +437,10 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test80:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -482,10 +482,10 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test79:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -527,10 +527,10 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test78:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -572,10 +572,10 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test77:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7]
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -617,10 +617,10 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test76:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packuswb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packuswb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -662,10 +662,10 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test75:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packssdw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packssdw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -707,10 +707,10 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test74:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    packsswb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    packsswb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1459,8 +1459,8 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test56:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pxor %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1504,8 +1504,8 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test55:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    por %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1549,10 +1549,10 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test54:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pandn %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pandn %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -1594,8 +1594,8 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test53:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pand %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1639,8 +1639,8 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test52:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmullw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1682,8 +1682,8 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test51:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmullw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1727,8 +1727,8 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test50:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1772,8 +1772,8 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test49:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaddwd %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -1817,10 +1817,10 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test48:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubusw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubusw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1862,10 +1862,10 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test47:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubusb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubusb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -1907,10 +1907,10 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test46:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1952,10 +1952,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test45:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubsb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubsb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -2032,10 +2032,10 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test43:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -2077,10 +2077,10 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test42:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -2122,10 +2122,10 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test41:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psubb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psubb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -2167,8 +2167,8 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test40:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddusw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2212,8 +2212,8 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test39:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddusb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2257,8 +2257,8 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test38:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2302,8 +2302,8 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test37:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddsb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2382,8 +2382,8 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test35:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddd %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2427,8 +2427,8 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test34:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2472,8 +2472,8 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test33:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    paddb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2517,8 +2517,8 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test32:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    psadbw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2560,8 +2560,8 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test31:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pminsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2605,8 +2605,8 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test30:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pminub %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2650,8 +2650,8 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test29:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaxsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2695,8 +2695,8 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test28:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmaxub %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2740,8 +2740,8 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test27:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pavgw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2785,8 +2785,8 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test26:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pavgb %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -2884,10 +2884,10 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
 ;
 ; X64-LABEL: test23:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    maskmovq %mm1, %mm0
+; X64-NEXT:    maskmovq %mm0, %mm1
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %n to <8 x i8>
@@ -2926,8 +2926,8 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test22:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhuw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -3041,8 +3041,8 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test20:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmuludq %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -3320,10 +3320,10 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test12:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3365,10 +3365,10 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test11:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3410,10 +3410,10 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test10:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    psignb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    psignb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3455,10 +3455,10 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test9:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pshufb %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pshufb %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3500,8 +3500,8 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
 ; X64-NEXT:    pmulhrsw %mm0, %mm1
 ; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
@@ -3545,10 +3545,10 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test7:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    pmaddubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    pmaddubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3590,10 +3590,10 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test6:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3635,10 +3635,10 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test5:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3680,10 +3680,10 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test4:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phsubw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phsubw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3725,10 +3725,10 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddsw %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddsw %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3770,10 +3770,10 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    movq %rsi, %mm1
-; X64-NEXT:    phaddd %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movq %rsi, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    phaddd %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3788,10 +3788,26 @@ entry:
 }
 
 define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
-; ALL-LABEL: test89:
-; ALL:       # %bb.0:
-; ALL-NEXT:    cvtpi2ps %mm0, %xmm0
-; ALL-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test89:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    cvtpi2ps (%esp), %xmm0
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test89:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    cvtpi2ps %mm0, %xmm0
+; X64-NEXT:    retq
   %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
   ret <4 x float> %c
 }
diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll
index 45587b8c69cd4..cd0ece12a1916 100644
--- a/llvm/test/CodeGen/X86/pr23246.ll
+++ b/llvm/test/CodeGen/X86/pr23246.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define <2 x i64> @test(x86_mmx %a) #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %rdi, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll
index 9a38515b65594..1ddcb1fb56524 100644
--- a/llvm/test/CodeGen/X86/pr29222.ll
+++ b/llvm/test/CodeGen/X86/pr29222.ll
@@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind {
 ; X86-AVX-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-AVX-NEXT:    packsswb %mm0, %mm0
 ; X86-AVX-NEXT:    movq %mm0, (%esp)
-; X86-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vpbroadcastq (%esp), %xmm0
 ; X86-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX-NEXT:    movl %ebp, %esp
@@ -54,7 +54,9 @@ define i32 @PR29222(i32) nounwind {
 ; X64-AVX-NEXT:    movd %edi, %mm0
 ; X64-AVX-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X64-AVX-NEXT:    packsswb %mm0, %mm0
-; X64-AVX-NEXT:    movq2dq %mm0, %xmm0
+; X64-AVX-NEXT:    movq %mm0, %rax
+; X64-AVX-NEXT:    vmovq %rax, %xmm0
+; X64-AVX-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll
index 27b7ebb8381cd..8339cb71d4671 100644
--- a/llvm/test/CodeGen/X86/select-mmx.ll
+++ b/llvm/test/CodeGen/X86/select-mmx.ll
@@ -14,15 +14,11 @@ define i64 @test47(i64 %arg)  {
 ;
 ; X64-LABEL: test47:
 ; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB0_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    pxor %mm0, %mm0
-; X64-NEXT:    jmp .LBB0_3
-; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:    movd %eax, %mm0
-; X64-NEXT:  .LBB0_3:
+; X64-NEXT:    movl $7, %ecx
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    movq %rcx, %mm0
 ; X64-NEXT:    psllw %mm0, %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
@@ -35,17 +31,17 @@ define i64 @test47(i64 %arg)  {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    orl 12(%ebp), %eax
-; X86-NEXT:    je .LBB0_1
-; X86-NEXT:  # %bb.2:
-; X86-NEXT:    pxor %mm0, %mm0
-; X86-NEXT:    jmp .LBB0_3
-; X86-NEXT:  .LBB0_1:
 ; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    movd %eax, %mm0
-; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psllw %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
@@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
 ; X64-LABEL: test49:
 ; X64:       # %bb.0:
 ; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB1_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    movq %rdx, %mm0
-; X64-NEXT:    jmp .LBB1_3
-; X64-NEXT:  .LBB1_1:
+; X64-NEXT:    cmovneq %rdx, %rsi
 ; X64-NEXT:    movq %rsi, %mm0
-; X64-NEXT:  .LBB1_3:
 ; X64-NEXT:    psllw %mm0, %mm0
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
index 11ca9e2a547ee..6652a8ca0dbd5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
@@ -9,7 +9,7 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
@@ -20,6 +20,7 @@ declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
 define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
 ; CHECK-LABEL: stack_fold_cvtpi2pd:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
@@ -35,6 +36,7 @@ declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
 define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
 ; CHECK-LABEL: stack_fold_cvtpi2ps:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
@@ -55,7 +57,7 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
@@ -71,7 +73,7 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
@@ -87,7 +89,7 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
@@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    paddb %mm0, %mm0
 ; CHECK-NEXT:    movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    #APP
@@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    paddb %mm0, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
@@ -161,12 +165,13 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
 define x86_mmx @stack_fold_pabsb(x86_mmx %a0) {
 ; CHECK-LABEL: stack_fold_pabsb:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone
@@ -177,12 +182,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pabsd(x86_mmx %a0) {
 ; CHECK-LABEL: stack_fold_pabsd:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone
@@ -193,12 +199,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pabsw(x86_mmx %a0) {
 ; CHECK-LABEL: stack_fold_pabsw:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone
@@ -209,12 +216,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_packssdw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packssdw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -225,12 +233,13 @@ declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_packsswb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packsswb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -241,12 +250,13 @@ declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_packuswb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    packuswb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -257,12 +267,13 @@ declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -273,12 +284,13 @@ declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -289,12 +301,13 @@ declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -305,12 +318,13 @@ declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddsb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddsb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -321,12 +335,13 @@ declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -337,12 +352,13 @@ declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddusb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddusb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -353,12 +369,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddusw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddusw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -369,12 +386,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_paddw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    paddw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -385,12 +403,13 @@ declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_palignr:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    palignr $1, %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone
@@ -401,12 +420,13 @@ declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
 define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pand:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pand %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -417,12 +437,13 @@ declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pandn:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pandn %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -433,12 +454,13 @@ declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pavgb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pavgb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -449,12 +471,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pavgw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pavgw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -465,12 +488,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -481,12 +505,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -497,12 +522,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpeqw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpeqw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -513,12 +539,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -529,12 +556,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -545,12 +573,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pcmpgtw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pcmpgtw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -561,12 +590,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phaddd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -577,12 +607,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phaddsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -593,12 +624,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phaddw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phaddw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -609,12 +641,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phsubd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -625,12 +658,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phsubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -641,12 +675,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_phsubw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    phsubw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -659,12 +694,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmaddubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaddubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -675,12 +711,13 @@ declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmaddwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaddwd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -691,12 +728,13 @@ declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmaxsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaxsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -707,12 +745,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmaxub:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmaxub %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -723,12 +762,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pminsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pminsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -739,12 +779,13 @@ declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pminub:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pminub %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -755,12 +796,13 @@ declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmulhrsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhrsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -771,12 +813,13 @@ declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmulhuw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhuw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -787,12 +830,13 @@ declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmulhw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmulhw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -803,12 +847,13 @@ declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmullw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmullw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -819,12 +864,13 @@ declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pmuludq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pmuludq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -835,12 +881,13 @@ declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_por:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    por %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -851,12 +898,13 @@ declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psadbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psadbw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psadbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -867,14 +915,13 @@ declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pshufb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pshufb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload
-; CHECK-NEXT:    pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -885,13 +932,14 @@ declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pshufw(x86_mmx %a) {
 ; CHECK-LABEL: stack_fold_pshufw:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %mm0
 ; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    # mm0 = mem[1,0,0,0]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone
@@ -902,12 +950,13 @@ declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
 define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) {
 ; CHECK-LABEL: stack_fold_psignb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -918,12 +967,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) {
 ; CHECK-LABEL: stack_fold_psignd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -934,12 +984,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) {
 ; CHECK-LABEL: stack_fold_psignw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psignw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -950,12 +1001,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pslld:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pslld %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -966,12 +1018,13 @@ declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psllq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psllq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -982,12 +1035,13 @@ declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psllw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psllw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -998,12 +1052,13 @@ declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psrad:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrad %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1014,12 +1069,13 @@ declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psraw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psraw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1030,12 +1086,13 @@ declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psrld:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrld %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1046,12 +1103,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psrlq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrlq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1062,12 +1120,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psrlw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psrlw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1078,12 +1137,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1094,12 +1154,13 @@ declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubd %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1110,12 +1171,13 @@ declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubq %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1126,12 +1188,13 @@ declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubsb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubsb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1142,12 +1205,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubsw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubsw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1158,12 +1222,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubusb:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubusb %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1174,12 +1239,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubusw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubusw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1190,12 +1256,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_psubw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    psubw %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1206,13 +1273,13 @@ declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpckhbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1223,13 +1290,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpckhdq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[1],mem[1]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1240,13 +1307,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpckhwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1257,13 +1324,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpcklbw:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1274,13 +1341,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpckldq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1291,13 +1358,13 @@ declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_punpcklwd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mm0[0],mem[0],mm0[1],mem[1]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1308,12 +1375,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
 define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
 ; CHECK-LABEL: stack_fold_pxor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq %rsi, %mm0
+; CHECK-NEXT:    movq %rdi, %mm1
+; CHECK-NEXT:    pxor %mm0, %mm1
+; CHECK-NEXT:    movq %mm1, %rax
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index 672b4591316ce..6fd90243a9303 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -96,12 +96,13 @@ entry:
 define i32 @test3(x86_mmx %a) nounwind {
 ; X86-LABEL: test3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movd %mm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %mm0, %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %tmp0 = bitcast x86_mmx %a to <2 x i32>
   %tmp1 = extractelement <2 x i32> %tmp0, i32 0
@@ -112,14 +113,12 @@ define i32 @test3(x86_mmx %a) nounwind {
 define i32 @test4(x86_mmx %a) nounwind {
 ; X86-LABEL: test4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movq2dq %mm0, %xmm0
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq %rdi, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index cea047453de43..aed8782ba40aa 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -9,13 +9,12 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X86-LABEL: mmx_movzl:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    movd %eax, %mm0
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mmx_movzl:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    retq
   %tmp = bitcast x86_mmx %x to <2 x i32>
   %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
index f561a2a20e194..c00417080fe36 100644
--- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -6,15 +6,15 @@
 define x86_mmx @t0(i32 %A) nounwind {
 ; X86-LABEL: t0:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
-; X86-NEXT:    pxor %mm0, %mm0
-; X86-NEXT:    punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t0:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    movd %edi, %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X64-NEXT:    psllq $32, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
 ; X64-NEXT:    retq
   %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
   %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 39b2b6225d8b1..dce1aa2bcd1d4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -537,23 +537,22 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -787,13 +778,13 @@ define i64 @test71(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT:    [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
@@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -921,13 +906,13 @@ define i64 @test68(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT:    [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
@@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1144,18 +1125,21 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
 ; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
@@ -1180,20 +1164,21 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1268,18 +1254,21 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
 ; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
@@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1672,22 +1663,22 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP18]]
 ;
@@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1872,13 +1863,16 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
@@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2185,13 +2179,16 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
@@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2338,17 +2335,20 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 48
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64
+; CHECK-NEXT:    store i64 [[TMP15]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
@@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2612,16 +2612,19 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT:    store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       3:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       4:
-; CHECK-NEXT:    tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK:       8:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2641,15 +2644,16 @@ define i32 @test24(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
@@ -2674,21 +2678,23 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK:       9:
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       11:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
+; CHECK:       12:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
@@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
@@ -2821,13 +2829,14 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
-; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64
-; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
@@ -2851,15 +2860,16 @@ define <2 x double> @test19(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]]
 ; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
@@ -2885,8 +2895,8 @@ define i64 @test18(<2 x double> %a) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       3:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
@@ -2915,8 +2925,8 @@ define i64 @test17(<2 x double> %a) #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       3:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
@@ -2941,20 +2951,24 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16)
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
 ; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
@@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3514,20 +3528,21 @@ define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
 ; ALL-NEXT:    cvtpi2ps %mm0, %xmm0
 ; ALL-NEXT:    ret{{[l|q]}}
 ; CHECK-LABEL: define <4 x float> @test89(
-; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK:       4:
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       5:
-; CHECK-NEXT:    [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]])
+; CHECK:       6:
+; CHECK-NEXT:    [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]])
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[C]]
 ;
@@ -3561,8 +3576,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0
 ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
@@ -3571,8 +3586,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       5:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[TMP8]], i32 [[D]], i32 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i64> [[TMP9]] to <1 x i64>
 ; CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <1 x i64> [[TMP2]]
 ;
@@ -3591,15 +3606,15 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
 ; CHECK:       3:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       4:
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[TMP6]], i32 2)
 ; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 57d6003b3873f..5197f3277ed80 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -31,12 +31,12 @@ entry:
 }
 
 ; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
-; CHECK: or i64
-; CHECK: bitcast i64 {{.*}} to <4 x i16>
+; CHECK: or <1 x i64>
+; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK: bitcast <4 x i16> {{.*}} to i64
-; CHECK: ret x86_mmx
+; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK: ret <1 x i64>
 
 
 define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
@@ -60,8 +60,8 @@ entry:
 }
 
 ; CHECK-LABEL: @Test_x86_mmx_psad_bw(
-; CHECK: or i64
+; CHECK: or <1 x i64>
 ; CHECK: icmp ne i64
 ; CHECK: sext i1 {{.*}} to i64
 ; CHECK: lshr i64 {{.*}}, 48
-; CHECK: ret x86_mmx
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
index 52acbfe0a0e77..6ae03f288e2c0 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -42,9 +42,9 @@ entry:
 ; CHECK: icmp ne {{.*}}[[S]], 0
 ; CHECK: br
 ; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
-; CHECK: store i64 0, {{.*}} @__msan_retval_tls
-; CHECK: ret x86_mmx
+; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi
+; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls
+; CHECK: ret <1 x i64>
 
 ; avx512 rounding conversion.
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 4f08ea7c00afe..1289abd63667e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -48,15 +48,14 @@ entry:
 }
 
 ; CHECK-LABEL: @Test_mmx_packuswb(
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
-; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
-; CHECK: ret x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 196285d910a6d..3c6c44194e3ac 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -30,11 +30,11 @@ entry:
 
 ; CHECK-LABEL: @test_mmx
 ; CHECK: = icmp ne i64 {{.*}}, 0
-; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64
-; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(
-; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64
-; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]]
-; CHECK: call x86_mmx @llvm.x86.mmx.psll.d(
+; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64
+; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64>
+; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(
+; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]]
+; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d(
 ; CHECK: ret i64
 
 
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 63114288fc581..9fbc39241d8e9 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
@@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() {
 ; Constant Folding (ZERO -> ZERO)
 ;
 
-define i32 @zero_x86_mmx_pmovmskb() {
-; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
-; CHECK-NEXT:    ret i32 [[TMP1]]
-;
-  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
-  ret i32 %2
-}
-
 define i32 @zero_x86_sse_movmsk_ps() {
 ; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
 ; CHECK-NEXT:    ret i32 0
@@ -271,7 +261,7 @@ define i32 @zero_x86_avx2_pmovmskb() {
 
 define i32 @fold_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> <i8 0, i8 -1, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 0> to x86_mmx))
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> <i64 18084223940296448>)
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
index 38a7391a1a1e3..d4ec9e3aae679 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
 
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT:    ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
-
 ; FP source is ok.
 
 define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
@@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
 
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT:    [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT:    ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 8b8325b147263..f787b3c4cc9ac 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
 
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT:    ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT:    ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
 
 ; FP source is ok.
 
@@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
 
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT:    [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT:    [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT:    ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index d4c49faf91b09..dd75560e25ced 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() {
   ret { i64, i64 } %v
 }
 
- at m64 = internal constant [2 x i64] zeroinitializer
- at idx = external global i32
-
-; This should not try to create an x86_mmx null value.
-define x86_mmx @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT:    [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8
-; CHECK-NEXT:    ret x86_mmx [[TEMP]]
-;
-  %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64))
-  ret x86_mmx %temp
-}
-
 @g_offset = external global i64
 
 @g_neg_one_vec = constant <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
deleted file mode 100644
index b460b79d0640a..0000000000000
--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define x86_mmx @f() #0 {
-; CHECK-LABEL: define x86_mmx @f
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[PHI]], 7
-; CHECK-NEXT:    [[ADD_7]] = add i32 [[PHI]], 8
-; CHECK-NEXT:    [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0
-; CHECK-NEXT:    br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ]
-; CHECK-NEXT:    ret x86_mmx [[RET]]
-;
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %phi = phi i32 [ 1, %entry ], [ %add, %for.body ]
-  %add = add i32 %phi, 1
-  %cmp = icmp eq i32 %phi, 0
-  br i1 %cmp, label %exit, label %for.body
-
-exit:                                             ; preds = %for.body
-  %ret = phi x86_mmx [ undef, %for.body ]
-  ret x86_mmx %ret
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
index 7476ddb0fb873..19ca68fc9cb2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -4,68 +4,6 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define void @test1(x86_mmx %a, x86_mmx %b, ptr %ptr) {
-; Ensure we can handle x86_mmx values which are primitive and can be bitcast
-; with integer types but can't be put into a vector.
-;
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT:    [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT:    [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT:    [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i32 1
-; CHECK-NEXT:    store i64 [[A_AND]], ptr [[PTR]], align 8
-; CHECK-NEXT:    store i64 [[B_AND]], ptr [[GEP]], align 8
-; CHECK-NEXT:    ret void
-;
-entry:
-  %a.cast = bitcast x86_mmx %a to i64
-  %b.cast = bitcast x86_mmx %b to i64
-  %a.and = and i64 %a.cast, 42
-  %b.and = and i64 %b.cast, 42
-  %gep = getelementptr i64, ptr %ptr, i32 1
-  store i64 %a.and, ptr %ptr
-  store i64 %b.and, ptr %gep
-  ret void
-}
-
-define void @test2(x86_mmx %a, x86_mmx %b) {
-; Same as @test1 but using phi-input vectorization instead of store
-; vectorization.
-;
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT:    [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT:    [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT:    [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT:    br label [[EXIT]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ]
-; CHECK-NEXT:    tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]])
-; CHECK-NEXT:    ret void
-;
-entry:
-  br i1 undef, label %if.then, label %exit
-
-if.then:
-  %a.cast = bitcast x86_mmx %a to i64
-  %b.cast = bitcast x86_mmx %b to i64
-  %a.and = and i64 %a.cast, 42
-  %b.and = and i64 %b.cast, 42
-  br label %exit
-
-exit:
-  %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
-  %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
-  tail call void @f(i64 %a.phi, i64 %b.phi)
-  ret void
-}
-
 define i8 @test3(ptr %addr) {
 ; Check that we do not vectorize types that are padded to a bigger ones.
 ;
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index 1bf1ad7ee934a..dbcb6d0784971 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -17,9 +17,9 @@ define void @foo() {
 ; CHECK-NEXT:    [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
 ; CHECK-NEXT:    [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
-; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
-; CHECK-NEXT:    store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0)
+; CHECK-NEXT:    store <1 x i64> [[TMP2]], ptr @A, align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index c5ae051c0a301..1e78637bf47ca 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -153,8 +153,6 @@ struct TypeCloner {
         return LLVMMetadataTypeInContext(Ctx);
       case LLVMX86_AMXTypeKind:
         return LLVMX86AMXTypeInContext(Ctx);
-      case LLVMX86_MMXTypeKind:
-        return LLVMX86MMXTypeInContext(Ctx);
       case LLVMTokenTypeKind:
         return LLVMTokenTypeInContext(Ctx);
       case LLVMTargetExtTypeKind: {
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index 758643f1414c1..80fb21038d304 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -173,8 +173,6 @@ struct Modifier {
         Ty = Type::getX86_FP80Ty(Context);
       else if (Arg == "ppc_fp128")
         Ty = Type::getPPC_FP128Ty(Context);
-      else if (Arg == "x86_mmx")
-        Ty = Type::getX86_MMXTy(Context);
       else if (Arg.starts_with("i")) {
         unsigned N = 0;
         Arg.drop_front().getAsInteger(10, N);
@@ -294,11 +292,7 @@ struct Modifier {
   /// Pick a random vector type.
   Type *pickVectorType(VectorType *VTy = nullptr) {
 
-    // Vectors of x86mmx are illegal; keep trying till we get something else.
-    Type *Ty;
-    do {
-      Ty = pickScalarType();
-    } while (Ty->isX86_MMXTy());
+    Type *Ty = pickScalarType();
 
     if (VTy)
       return VectorType::get(Ty, VTy->getElementCount());
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 4c1e9a9acb29a..44b25035dde2c 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) {
   Type *Int64Ty = Type::getInt64Ty(C);
   Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8);
   Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8);
-  Type *X86MMXTy = Type::getX86_MMXTy(C);
 
   Type *HalfTy = Type::getHalfTy(C);
   Type *FloatTy = Type::getFloatTy(C);
@@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) {
   EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true));
   EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true));
 
-  EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy));
-  EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty));
-  EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy));
   EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty));
   EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty));
 
@@ -1745,7 +1741,7 @@ TEST(InstructionsTest, AllocaInst) {
         %A = alloca i32, i32 1
         %B = alloca i32, i32 4
         %C = alloca i32, i32 %n
-        %D = alloca <8 x double>
+        %D = alloca double
         %E = alloca <vscale x 8 x double>
         %F = alloca [2 x half]
         %G = alloca [2 x [3 x i128]]
@@ -1771,7 +1767,8 @@ TEST(InstructionsTest, AllocaInst) {
   EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128));
   EXPECT_FALSE(C.getAllocationSizeInBits(DL));
-  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512));
+  EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64));
+  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64));
   EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512));
   EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768));
diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md
index bc0f484108fac..fadc81b567b4e 100644
--- a/mlir/docs/Dialects/LLVM.md
+++ b/mlir/docs/Dialects/LLVM.md
@@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type.
 The following non-parametric types derived from the LLVM IR are available in the
 LLVM dialect:
 
--   `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86
-    machine.
 -   `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two
     64 bits).
 -   `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index 93733ccd4929a..1befdfa74f67c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -67,7 +67,6 @@ namespace LLVM {
 
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128");
-DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 73f92e5165a8c..90610118a45cd 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3180,7 +3180,6 @@ void LLVMDialect::initialize() {
   // clang-format off
   addTypes<LLVMVoidType,
            LLVMPPCFP128Type,
-           LLVMX86MMXType,
            LLVMTokenType,
            LLVMLabelType,
            LLVMMetadataType,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
index 9810d4d643677..c4708d826f2b3 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
@@ -35,7 +35,6 @@ static StringRef getTypeKeyword(Type type) {
   return TypeSwitch<Type, StringRef>(type)
       .Case<LLVMVoidType>([&](Type) { return "void"; })
       .Case<LLVMPPCFP128Type>([&](Type) { return "ppc_fp128"; })
-      .Case<LLVMX86MMXType>([&](Type) { return "x86_mmx"; })
       .Case<LLVMTokenType>([&](Type) { return "token"; })
       .Case<LLVMLabelType>([&](Type) { return "label"; })
       .Case<LLVMMetadataType>([&](Type) { return "metadata"; })
@@ -309,7 +308,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) {
   return StringSwitch<function_ref<Type()>>(key)
       .Case("void", [&] { return LLVMVoidType::get(ctx); })
       .Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); })
-      .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); })
       .Case("token", [&] { return LLVMTokenType::get(ctx); })
       .Case("label", [&] { return LLVMLabelType::get(ctx); })
       .Case("metadata", [&] { return LLVMMetadataType::get(ctx); })
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index cf3f38b710130..dc7aef8ef7f85 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -780,8 +780,7 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) {
       LLVMFixedVectorType,
       LLVMScalableVectorType,
       LLVMTargetExtType,
-      LLVMVoidType,
-      LLVMX86MMXType
+      LLVMVoidType
     >(type)) {
     // clang-format on
     return true;
@@ -843,8 +842,7 @@ static bool isCompatibleImpl(Type type, DenseSet<Type> &compatibleTypes) {
             LLVMMetadataType,
             LLVMPPCFP128Type,
             LLVMTokenType,
-            LLVMVoidType,
-            LLVMX86MMXType
+            LLVMVoidType
           >([](Type) { return true; })
           // clang-format on
           .Default([](Type) { return false; });
@@ -986,8 +984,7 @@ llvm::TypeSize mlir::LLVM::getPrimitiveTypeSizeInBits(Type type) {
       .Case<BFloat16Type, Float16Type>(
           [](Type) { return llvm::TypeSize::getFixed(16); })
       .Case<Float32Type>([](Type) { return llvm::TypeSize::getFixed(32); })
-      .Case<Float64Type, LLVMX86MMXType>(
-          [](Type) { return llvm::TypeSize::getFixed(64); })
+      .Case<Float64Type>([](Type) { return llvm::TypeSize::getFixed(64); })
       .Case<Float80Type>([](Type) { return llvm::TypeSize::getFixed(80); })
       .Case<Float128Type>([](Type) { return llvm::TypeSize::getFixed(128); })
       .Case<IntegerType>([](IntegerType intTy) {
diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
index a4db958207756..db184ae8e6e83 100644
--- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
@@ -65,8 +65,6 @@ class TypeFromLLVMIRTranslatorImpl {
       return Float80Type::get(&context);
     if (type->isPPC_FP128Ty())
       return LLVM::LLVMPPCFP128Type::get(&context);
-    if (type->isX86_MMXTy())
-      return LLVM::LLVMX86MMXType::get(&context);
     if (type->isLabelTy())
       return LLVM::LLVMLabelType::get(&context);
     if (type->isMetadataTy())
diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
index 6d8b415ff09dc..6591502723880 100644
--- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
@@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl {
             .Case([this](LLVM::LLVMPPCFP128Type) {
               return llvm::Type::getPPC_FP128Ty(context);
             })
-            .Case([this](LLVM::LLVMX86MMXType) {
-              return llvm::Type::getX86_MMXTy(context);
-            })
             .Case([this](LLVM::LLVMTokenType) {
               return llvm::Type::getTokenTy(context);
             })
diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir
index 2dd292408fa60..42d370a5477c2 100644
--- a/mlir/test/Dialect/LLVMIR/types.mlir
+++ b/mlir/test/Dialect/LLVMIR/types.mlir
@@ -6,8 +6,6 @@ func.func @primitive() {
   "some.op"() : () -> !llvm.void
   // CHECK: !llvm.ppc_fp128
   "some.op"() : () -> !llvm.ppc_fp128
-  // CHECK: !llvm.x86_mmx
-  "some.op"() : () -> !llvm.x86_mmx
   // CHECK: !llvm.token
   "some.op"() : () -> !llvm.token
   // CHECK: !llvm.label
diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir
index c85fa0101c00d..3e533211b0d0c 100644
--- a/mlir/test/Target/LLVMIR/llvmir-types.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir
@@ -20,8 +20,6 @@ llvm.func @return_fp128() -> f128
 llvm.func @return_x86_fp80() -> f80
 // CHECK: declare ppc_fp128 @return_ppc_fp128()
 llvm.func @return_ppc_fp128() -> !llvm.ppc_fp128
-// CHECK: declare x86_mmx @return_x86_mmx()
-llvm.func @return_x86_mmx() -> !llvm.x86_mmx
 
 //
 // Functions.

>From fd17064dfa08c252c12b71167bc588f57e6a082c Mon Sep 17 00:00:00 2001
From: Zibi Sarbinowski <zibi at ca.ibm.com>
Date: Thu, 25 Jul 2024 09:25:11 -0400
Subject: [PATCH 135/259] Fix fs.op.copy_file/copy_file_large.pass.cpp on z/OS
 (#100382)

This PR fixes copy_file_large.pass.cpp, which is failing on z/OS in all
ASCII variations. The problem is that the destination file is opened in
binary mode, so no auto-conversion happens when this lit test is compiled
with -fzos-le-char-mode=ascii.

In addition, opening the destination file in text mode matches the text
mode used by `fopen()` for the source file.
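
As a standalone illustration of the mode-matching principle described above
(this sketch is not part of the test; the file name and contents are
hypothetical, and the auto-conversion behaviour is the z/OS one noted above):

#include <cassert>
#include <cstdio>

int main() {
  // Write through a text-mode stream; with -fzos-le-char-mode=ascii this is
  // where the automatic character conversion happens.
  std::FILE* out = std::fopen("example.dat", "w");
  assert(out != nullptr);
  std::fputs("zzzz", out);
  std::fclose(out);

  // Read it back through a text-mode stream as well. Opening with "rb" here
  // would bypass the conversion and the bytes read would not match what the
  // text-mode writer produced.
  std::FILE* in = std::fopen("example.dat", "r");
  assert(in != nullptr);
  char buf[5] = {};
  assert(std::fread(buf, 1, 4, in) == 4);
  std::fclose(in);
  std::remove("example.dat");
  return 0;
}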
---
 .../fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
index db15ee193ad2a..53bf828fc120b 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
@@ -72,7 +72,7 @@ static void large_file() {
   // the data at the end of the source file.
   std::string out_data(additional_size, 'z');
   {
-    std::FILE* dest_file = std::fopen(dest.string().c_str(), "rb");
+    std::FILE* dest_file = std::fopen(dest.string().c_str(), "r");
     assert(dest_file != nullptr);
     assert(std::fseek(dest_file, sendfile_size_limit, SEEK_SET) == 0);
     assert(std::fread(&out_data[0], sizeof(out_data[0]), additional_size, dest_file) == additional_size);

>From ca00cec997c2a22dd6603cddb8bab789e1b01d34 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Thu, 25 Jul 2024 21:29:18 +0800
Subject: [PATCH 136/259] [ConstantRange] Infer nonnegative for mul nuw nsw
 (#100554)

Alive2: https://alive2.llvm.org/ce/z/byzmsV
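
As a standalone sketch of the new inference (it assumes an LLVM build to
compile and link against, and simply mirrors the unit tests added below):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using OBO = OverflowingBinaryOperator;

int main() {
  // X is completely unknown; Y is known to satisfy Y s> 1 (range [2, 128)).
  ConstantRange X = ConstantRange::getFull(8);
  ConstantRange Y(APInt(8, 2), APInt(8, 128));

  // With both nuw and nsw the product is provably non-negative.
  ConstantRange Both =
      X.multiplyWithNoWrap(Y, OBO::NoUnsignedWrap | OBO::NoSignedWrap);
  outs() << "nuw+nsw all non-negative: "
         << (Both.isAllNonNegative() ? "true" : "false") << "\n";   // true

  // With nsw alone the inference does not fire.
  ConstantRange NswOnly = X.multiplyWithNoWrap(Y, OBO::NoSignedWrap);
  outs() << "nsw-only all non-negative: "
         << (NswOnly.isAllNonNegative() ? "true" : "false") << "\n"; // false
  return 0;
}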
---
 llvm/lib/IR/ConstantRange.cpp                 | 11 +++++++
 .../CorrelatedValuePropagation/mul.ll         | 32 +++++++++++++++++++
 llvm/unittests/IR/ConstantRangeTest.cpp       | 18 +++++++++++
 3 files changed, 61 insertions(+)

diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 0ead677422803..6068540cf08da 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -1242,6 +1242,17 @@ ConstantRange::multiplyWithNoWrap(const ConstantRange &Other,
   if (NoWrapKind & OverflowingBinaryOperator::NoUnsignedWrap)
     Result = Result.intersectWith(umul_sat(Other), RangeType);
 
+  // mul nsw nuw X, Y s>= 0 if X s> 1 or Y s> 1
+  if ((NoWrapKind == (OverflowingBinaryOperator::NoSignedWrap |
+                      OverflowingBinaryOperator::NoUnsignedWrap)) &&
+      !Result.isAllNonNegative()) {
+    if (getSignedMin().sgt(1) || Other.getSignedMin().sgt(1))
+      Result = Result.intersectWith(
+          getNonEmpty(APInt::getZero(getBitWidth()),
+                      APInt::getSignedMinValue(getBitWidth())),
+          RangeType);
+  }
+
   return Result;
 }
 
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
index 086043d4b7c1b..75ad9274eec85 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
@@ -224,3 +224,35 @@ define i1 @one_bit(i1 %a, i1 %b) {
   %mul = mul i1 %a, %b
   ret i1 %mul
 }
+
+define i1 @test_mul_nuw_nsw_nneg(i32 %x, i32 range(i32 3, 2147483648) %y) {
+; CHECK-LABEL: @test_mul_nuw_nsw_nneg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %mul = mul nuw nsw i32 %x, %y
+  %cmp = icmp sgt i32 %mul, -1
+  ret i1 %cmp
+}
+
+define i1 @test_mul_nuw_nsw_nneg_complex(i32 %x, i32 noundef %y, i32 %z) {
+; CHECK-LABEL: @test_mul_nuw_nsw_nneg_complex(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[Y:%.*]], 2
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[CMP2]], i32 3, i32 4
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP1]], i32 [[Y]], i32 [[SEL1]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[X]], [[SEL2]]
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %cmp1 = icmp sgt i32 %y, 2
+  %cmp2 = icmp eq i32 %x, 0
+  %sel1 = select i1 %cmp2, i32 3, i32 4
+  %sel2 = select i1 %cmp1, i32 %y, i32 %sel1
+  %mul = mul nuw nsw i32 %x, %sel2
+  %cmp3 = icmp sgt i32 %mul, -1
+  ret i1 %cmp3
+}
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index 0181e2ce6ac92..7977a78a7d3ec 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -1070,6 +1070,24 @@ TEST_F(ConstantRangeTest, MultiplyWithNoWrap) {
                 .multiplyWithNoWrap(ConstantRange(APInt(8, 100), APInt(8, 121)),
                                     OBO::NoSignedWrap),
             ConstantRange::getEmpty(8));
+  EXPECT_TRUE(ConstantRange::getFull(8)
+                  .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 128)),
+                                      OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+                  .isAllNonNegative());
+  EXPECT_TRUE(ConstantRange(APInt(8, 2), APInt(8, 128))
+                  .multiplyWithNoWrap(ConstantRange::getFull(8),
+                                      OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+                  .isAllNonNegative());
+  EXPECT_FALSE(
+      ConstantRange::getFull(8)
+          .multiplyWithNoWrap(ConstantRange(APInt(8, 1), APInt(8, 128)),
+                              OBO::NoUnsignedWrap | OBO::NoSignedWrap)
+          .isAllNonNegative());
+  EXPECT_FALSE(
+      ConstantRange::getFull(8)
+          .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 128)),
+                              OBO::NoSignedWrap)
+          .isAllNonNegative());
 
   TestBinaryOpExhaustive(
       [](const ConstantRange &CR1, const ConstantRange &CR2) {

>From 63a7ed45e89799adcca5c0493e184ed946c4a470 Mon Sep 17 00:00:00 2001
From: Timm Bäder <tbaeder at redhat.com>
Date: Mon, 22 Jul 2024 16:35:21 +0200
Subject: [PATCH 137/259] [clang][Interp] Reject non-literal values

---
 clang/lib/AST/Interp/Compiler.cpp    | 14 ++++++++++++
 clang/lib/AST/Interp/Compiler.h      |  2 ++
 clang/lib/AST/Interp/Interp.h        | 33 ++++++++++++++++++++++++++++
 clang/lib/AST/Interp/Opcodes.td      |  5 +++++
 clang/test/SemaCXX/new-delete-0x.cpp |  1 +
 5 files changed, 55 insertions(+)

diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 11ca7f210d2d5..c07c1062f68a1 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -3256,6 +3256,9 @@ bool Compiler<Emitter>::visitInitializer(const Expr *E) {
   if (E->containsErrors())
     return this->emitError(E);
 
+  if (!this->checkLiteralType(E))
+    return false;
+
   OptionScope<Emitter> Scope(this, /*NewDiscardResult=*/false,
                              /*NewInitializing=*/true);
   return this->Visit(E);
@@ -4698,6 +4701,17 @@ bool Compiler<Emitter>::emitLambdaStaticInvokerBody(const CXXMethodDecl *MD) {
   return this->emitRetVoid(MD);
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::checkLiteralType(const Expr *E) {
+  if (Ctx.getLangOpts().CPlusPlus23)
+    return true;
+
+  if (!E->isPRValue() || E->getType()->isLiteralType(Ctx.getASTContext()))
+    return true;
+
+  return this->emitCheckLiteralType(E->getType().getTypePtr(), E);
+}
+
 template <class Emitter>
 bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) {
   // Classify the return type.
diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h
index d22b29d29a92d..6bc9985fe7232 100644
--- a/clang/lib/AST/Interp/Compiler.h
+++ b/clang/lib/AST/Interp/Compiler.h
@@ -359,6 +359,8 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
 
+  bool checkLiteralType(const Expr *E);
+
 protected:
   /// Variable to storage mapping.
   llvm::DenseMap<const ValueDecl *, Scope::Local> Locals;
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 81c1b6a52189c..22a86abf38225 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -2970,6 +2970,39 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) {
                              BlockDesc, Source);
 }
 
+inline bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) {
+  assert(T);
+  assert(!S.getLangOpts().CPlusPlus23);
+
+  // C++1y: A constant initializer for an object o [...] may also invoke
+  // constexpr constructors for o and its subobjects even if those objects
+  // are of non-literal class types.
+  //
+  // C++11 missed this detail for aggregates, so classes like this:
+  //   struct foo_t { union { int i; volatile int j; } u; };
+  // are not (obviously) initializable like so:
+  //   __attribute__((__require_constant_initialization__))
+  //   static const foo_t x = {{0}};
+  // because "i" is a subobject with non-literal initialization (due to the
+  // volatile member of the union). See:
+  //   http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1677
+  // Therefore, we use the C++1y behavior.
+
+  if (S.EvaluatingDecl)
+    return true;
+
+  if (S.Current->getFunction() && S.Current->getFunction()->isConstructor() &&
+      S.Current->getThis().getDeclDesc()->asDecl() == S.EvaluatingDecl)
+    return true;
+
+  const Expr *E = S.Current->getExpr(OpPC);
+  if (S.getLangOpts().CPlusPlus11)
+    S.FFDiag(E, diag::note_constexpr_nonliteral) << E->getType();
+  else
+    S.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Read opcode arguments
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 9f29fa9272711..eeb9cb2e933a6 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -67,6 +67,7 @@ def ArgVarDecl : ArgType { let Name = "const VarDecl*"; }
 def ArgDesc : ArgType { let Name = "const Descriptor *"; }
 def ArgPrimType : ArgType { let Name = "PrimType"; }
 def ArgEnumDecl : ArgType { let Name = "const EnumDecl *"; }
+def ArgTypePtr : ArgType { let Name = "const Type *"; }
 
 //===----------------------------------------------------------------------===//
 // Classes of types instructions operate on.
@@ -396,6 +397,10 @@ def CheckEnumValue : Opcode {
   let HasGroup = 1;
 }
 
+def CheckLiteralType : Opcode {
+  let Args = [ArgTypePtr];
+}
+
 // [] -> [Value]
 def GetGlobal : AccessOpcode;
 def GetGlobalUnchecked : AccessOpcode;
diff --git a/clang/test/SemaCXX/new-delete-0x.cpp b/clang/test/SemaCXX/new-delete-0x.cpp
index a4b43308d010f..aa9ce37c8e3e4 100644
--- a/clang/test/SemaCXX/new-delete-0x.cpp
+++ b/clang/test/SemaCXX/new-delete-0x.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -std=c++11
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -std=c++11 -fexperimental-new-constant-interpreter
 
 using size_t = decltype(sizeof(0));
 struct noreturn_t {} constexpr noreturn = {};

>From 430cf6537b1e872faabf7b6859b1f7d556dc0da9 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev at amd.com>
Date: Thu, 25 Jul 2024 16:32:19 +0300
Subject: [PATCH 138/259] [AMDGPU][NFCI] Declare offset0/1 operands to be i32.
 (#100560)

Being of type i8 makes them signed, which they are not, and requires extra
masking work when they are printed.

Part of <https://github.com/llvm/llvm-project/issues/62629>.
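
For readers outside the backend, a plain C++ illustration (not LLVM code; the
value below is hypothetical) of why a signed 8-bit operand forces masking at
print time while a 32-bit operand does not:

#include <cstdint>
#include <cstdio>

int main() {
  // offset0/offset1 are unsigned 8-bit fields (isUInt<8>), so 255 is legal.
  uint8_t Encoded = 255;

  // Stored in a signed 8-bit slot and then widened, the value sign-extends...
  int64_t WidenedFromI8 = static_cast<int8_t>(Encoded);
  std::printf("unmasked: %lld\n", (long long)WidenedFromI8);          // -1
  // ...so the printer previously had to mask it back to its real value.
  std::printf("masked:   %lld\n", (long long)(WidenedFromI8 & 0xff)); // 255

  // Kept as a 32-bit (or wider) unsigned value, no masking is needed.
  int64_t AsI32 = Encoded;
  std::printf("as i32:   %lld\n", (long long)AsI32);                  // 255
  return 0;
}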
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp  | 18 ++++++++++--------
 llvm/lib/Target/AMDGPU/DSInstructions.td       |  8 ++++----
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp  | 17 ++++-------------
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h    |  1 -
 llvm/lib/Target/AMDGPU/SIInstrInfo.td          |  4 ++--
 5 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b7471bab12850..e36ec9595e85a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1270,8 +1270,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
     // (add n0, c0)
     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
       Base = N0;
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
       return true;
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
@@ -1306,8 +1306,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
 
           Base = SDValue(MachineSub, 0);
-          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+          Offset0 =
+              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+          Offset1 =
+              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
           return true;
         }
       }
@@ -1321,8 +1323,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
       MachineSDNode *MovZero =
           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
       Base = SDValue(MovZero, 0);
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
       return true;
     }
   }
@@ -1330,8 +1332,8 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
   // default case
 
   Base = Addr;
-  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
-  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
+  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 219246b71fe80..51da1b2b2b479 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -852,24 +852,24 @@ def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
 }
 
 class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
-  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1))),
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
 class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
-  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
   (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
               (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
               (i1 0))
 >;
 
 class DS128Bit8ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
-  (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+  (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i32:$offset0, i32:$offset1))),
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
 class DS128Bit8ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
-  (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
   (inst $ptr, (i64 (EXTRACT_SUBREG VReg_128:$value, sub0_sub1)),
               (i64 (EXTRACT_SUBREG VReg_128:$value, sub2_sub3)), $offset0, $offset1,
               (i1 0))
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 37bb9675d8c1d..983f3c430f7be 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -76,11 +76,6 @@ void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
 }
 
-void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
-}
-
 void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
                                               raw_ostream &O) {
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
@@ -138,19 +133,15 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
 void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " offset0:";
-    printU8ImmDecOperand(MI, OpNo, O);
-  }
+  if (int64_t Offset = MI->getOperand(OpNo).getImm())
+    O << " offset0:" << formatDec(Offset);
 }
 
 void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
-  if (MI->getOperand(OpNo).getImm()) {
-    O << " offset1:";
-    printU8ImmDecOperand(MI, OpNo, O);
-  }
+  if (int64_t Offset = MI->getOperand(OpNo).getImm())
+    O << " offset1:" << formatDec(Offset);
 }
 
 void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index d6d7fd34b68cc..3236816f69bc5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -41,7 +41,6 @@ class AMDGPUInstPrinter : public MCInstPrinter {
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
   void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 873e42aa0fed3..00a8b159c9526 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1067,8 +1067,8 @@ let ImmTy = "ImmTyOffset" in
 def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
 def Offset : NamedIntOperand<i32, "offset">;
 let Validator = "isUInt<8>" in {
-def Offset0 : NamedIntOperand<i8, "offset0">;
-def Offset1 : NamedIntOperand<i8, "offset1">;
+def Offset0 : NamedIntOperand<i32, "offset0">;
+def Offset1 : NamedIntOperand<i32, "offset1">;
 }
 
 def gds : NamedBitOperand<"gds", "GDS">;

>From de792d6d92c3d4fde7065f2590133acf81f1a93e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 25 Jul 2024 13:53:30 +0100
Subject: [PATCH 139/259] [SLP][X86] Update add/sub sat test checks to share
 common SSE prefixes

---
 .../SLPVectorizer/X86/arith-add-ssat.ll       | 99 +------------------
 .../SLPVectorizer/X86/arith-add-usat.ll       |  7 +-
 .../SLPVectorizer/X86/arith-sub-ssat.ll       | 99 +------------------
 .../SLPVectorizer/X86/arith-sub-usat.ll       |  7 +-
 4 files changed, 20 insertions(+), 192 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 8c4903dbc92bb..09c585200a598 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -61,41 +61,6 @@ define void @add_v8i64() {
 ; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v8i64(
-; SLM-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT:    ret void
-;
 ; AVX1-LABEL: @add_v8i64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
 ; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
@@ -198,25 +163,6 @@ define void @add_v16i32() {
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v16i32(
-; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-; SLM-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
@@ -322,25 +268,6 @@ define void @add_v32i16() {
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v32i16(
-; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-; SLM-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
@@ -510,25 +437,6 @@ define void @add_v64i8() {
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @add_v64i8(
-; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], ptr @c8, align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @add_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
 ; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
@@ -805,3 +713,6 @@ define void @add_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SLM: {{.*}}
+; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index cb8d45b1a21a2..93cce8b563ba5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -684,3 +684,6 @@ define void @add_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SLM: {{.*}}
+; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
index 65e2a011cc9a1..478751e6a2575 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -61,41 +61,6 @@ define void @sub_v8i64() {
 ; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v8i64(
-; SLM-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT:    ret void
-;
 ; AVX1-LABEL: @sub_v8i64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
 ; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
@@ -198,25 +163,6 @@ define void @sub_v16i32() {
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v16i32(
-; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], ptr @c32, align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-; SLM-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
@@ -322,25 +268,6 @@ define void @sub_v32i16() {
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v32i16(
-; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], ptr @c16, align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-; SLM-NEXT:    store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
@@ -510,25 +437,6 @@ define void @sub_v64i8() {
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
-; SLM-LABEL: @sub_v64i8(
-; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], ptr @c8, align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT:    ret void
-;
 ; AVX-LABEL: @sub_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
 ; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
@@ -805,3 +713,6 @@ define void @sub_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SLM: {{.*}}
+; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
index 18df499c6646e..bfe989be8ca10 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@@ -684,3 +684,6 @@ define void @sub_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SLM: {{.*}}
+; SSE2: {{.*}}

>From b2b68c241a61bbbe0e2f89cb2182f1b1bec564b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 25 Jul 2024 14:25:10 +0100
Subject: [PATCH 140/259] [CostModel][X86] Add add/sub sat intrinsic costs

Fixes regressions from #97463 due to missing costs for custom lowered ops
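
For readers skimming the tables below: each four-element braced initializer supplies one cost per cost kind, in the order { reciprocal throughput, latency, code size, size-and-latency }, and the cost query returns the element matching the requested kind. The old single-element { 1 } entries appear to populate only the throughput slot, so the other kinds fell back to generic expansion costs, which is the likely source of the regressions mentioned above. The following is a simplified, self-contained sketch of that kind-indexed lookup, not the actual X86TTIImpl code; the names SatCostEntry and lookupSatCost are invented for the example.

#include <array>
#include <cstdio>
#include <optional>
#include <string_view>

// One cost per cost kind, in the same order as the braced initializers in the
// patch: { reciprocal throughput, latency, code size, size-and-latency }.
enum class CostKind { RecipThroughput = 0, Latency, CodeSize, SizeAndLatency };

struct SatCostEntry {                 // hypothetical name, for illustration only
  std::string_view Op;                // e.g. "SADDSAT"
  std::string_view VT;                // e.g. "v32i16"
  std::array<unsigned, 4> Cost;       // indexed by CostKind
};

// A couple of entries in the style of the AVX512BW table touched by this patch.
constexpr SatCostEntry Table[] = {
    {"SADDSAT", "v32i16", {1, 1, 1, 1}},
    {"SSUBSAT", "v64i8",  {1, 1, 1, 1}},
};

// Return the cost of (Op, VT) for the requested kind, or nullopt if the pair
// is not tabled (a real cost model would then fall back to a generic expansion).
std::optional<unsigned> lookupSatCost(std::string_view Op, std::string_view VT,
                                      CostKind Kind) {
  for (const SatCostEntry &E : Table)
    if (E.Op == Op && E.VT == VT)
      return E.Cost[static_cast<unsigned>(Kind)];
  return std::nullopt;
}

int main() {
  if (auto C = lookupSatCost("SADDSAT", "v32i16", CostKind::CodeSize))
    std::printf("code-size cost: %u\n", *C);
  return 0;
}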
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 184 +++--
 .../CostModel/X86/arith-ssat-codesize.ll      | 578 +++++++--------
 .../CostModel/X86/arith-ssat-latency.ll       | 700 +++++++++---------
 .../CostModel/X86/arith-ssat-sizelatency.ll   | 656 ++++++++--------
 .../test/Analysis/CostModel/X86/arith-ssat.ll | 318 ++++----
 .../CostModel/X86/arith-usat-codesize.ll      | 696 ++++++++---------
 .../CostModel/X86/arith-usat-latency.ll       | 504 ++++++-------
 .../CostModel/X86/arith-usat-sizelatency.ll   | 684 ++++++++---------
 .../test/Analysis/CostModel/X86/arith-usat.ll | 312 ++++----
 .../SLPVectorizer/X86/arith-add-ssat.ll       | 128 ++--
 .../SLPVectorizer/X86/arith-add-usat.ll       |  90 ++-
 .../SLPVectorizer/X86/arith-sub-ssat.ll       |  41 +-
 .../SLPVectorizer/X86/arith-sub-usat.ll       |  90 ++-
 13 files changed, 2532 insertions(+), 2449 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index dc3ac80bdf5cf..ec8af2bc4c118 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3590,22 +3590,22 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { X86ISD::VROTLI,  MVT::v64i8,   {  2,  9,  3,  4 } },
     { X86ISD::VROTLI,  MVT::v32i8,   {  1,  9,  3,  4 } },
     { X86ISD::VROTLI,  MVT::v16i8,   {  1,  8,  3,  4 } },
-    { ISD::SADDSAT,    MVT::v32i16,  {  1 } },
-    { ISD::SADDSAT,    MVT::v64i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
-    { ISD::SSUBSAT,    MVT::v32i16,  {  1 } },
-    { ISD::SSUBSAT,    MVT::v64i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v32i16,  {  1 } },
-    { ISD::UADDSAT,    MVT::v64i8,   {  1 } },
+    { ISD::SSUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::SSUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v64i8,   {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v32i16,  {  1,  1,  1,  1 } },
     { ISD::UMIN,       MVT::v64i8,   {  1,  1,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v32i16,  {  1 } },
-    { ISD::USUBSAT,    MVT::v64i8,   {  1 } },
+    { ISD::USUBSAT,    MVT::v32i16,  {  1,  1,  1,  1 } },
+    { ISD::USUBSAT,    MVT::v64i8,   {  1,  1,  1,  1 } },
   };
   static const CostKindTblEntry AVX512CostTbl[] = {
     { ISD::ABS,        MVT::v8i64,   {  1,  1,  1,  1 } },
@@ -3654,6 +3654,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { X86ISD::VROTLI,  MVT::v16i32,  {  1,  1,  1,  1 } },
     { X86ISD::VROTLI,  MVT::v8i32,   {  1,  1,  1,  1 } },
     { X86ISD::VROTLI,  MVT::v4i32,   {  1,  1,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   {  3,  3,  8,  9 } },
+    { ISD::SADDSAT,    MVT::v4i64,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v8i64,   {  3,  3,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v8i32,   {  2,  2,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v16i32,  {  3,  3,  6,  7 } },
+    { ISD::SADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::SADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::SMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
     { ISD::SMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
@@ -3666,6 +3674,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
     { ISD::SMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
     { ISD::SMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   {  2, 13,  9, 10 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   {  2, 15,  7,  8} },
+    { ISD::SSUBSAT,    MVT::v8i64,   {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v8i32,   {  2, 15,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v16i32,  {  2, 14,  7,  8 } },
+    { ISD::SSUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::SSUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::UMAX,       MVT::v8i64,   {  1,  3,  1,  1 } },
     { ISD::UMAX,       MVT::v16i32,  {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v32i16,  {  3,  7,  5,  5 } },
@@ -3678,22 +3694,21 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v64i8,   {  3,  7,  5,  5 } },
     { ISD::UMIN,       MVT::v4i64,   {  1,  3,  1,  1 } },
     { ISD::UMIN,       MVT::v2i64,   {  1,  3,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v16i32,  {  2 } }, // pmaxud + psubd
-    { ISD::USUBSAT,    MVT::v2i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::USUBSAT,    MVT::v4i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::USUBSAT,    MVT::v8i64,   {  2 } }, // pmaxuq + psubq
-    { ISD::UADDSAT,    MVT::v16i32,  {  3 } }, // not + pminud + paddd
-    { ISD::UADDSAT,    MVT::v2i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::UADDSAT,    MVT::v4i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::UADDSAT,    MVT::v8i64,   {  3 } }, // not + pminuq + paddq
-    { ISD::SADDSAT,    MVT::v32i16,  {  2 } },
-    { ISD::SADDSAT,    MVT::v64i8,   {  2 } },
-    { ISD::SSUBSAT,    MVT::v32i16,  {  2 } },
-    { ISD::SSUBSAT,    MVT::v64i8,   {  2 } },
-    { ISD::UADDSAT,    MVT::v32i16,  {  2 } },
-    { ISD::UADDSAT,    MVT::v64i8,   {  2 } },
-    { ISD::USUBSAT,    MVT::v32i16,  {  2 } },
-    { ISD::USUBSAT,    MVT::v64i8,   {  2 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v8i64,   {  1,  4,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  1,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v8i32,   {  1,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v16i32,  {  2,  2,  4,  4 } },
+    { ISD::UADDSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::UADDSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v8i64,   {  1,  4,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v8i32,   {  1,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v16i32,  {  1,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v32i16,  {  2,  2,  2,  2 } },
+    { ISD::USUBSAT,    MVT::v64i8,   {  2,  2,  2,  2 } },
     { ISD::FMAXNUM,    MVT::f32,     {  2,  2,  3,  3 } },
     { ISD::FMAXNUM,    MVT::v4f32,   {  1,  1,  3,  3 } },
     { ISD::FMAXNUM,    MVT::v8f32,   {  2,  2,  3,  3 } },
@@ -3797,8 +3812,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v16i16,  {  6,  9, 14, 24 } },
     { ISD::CTTZ,       MVT::v16i8,   {  3,  7, 11, 11 } },
     { ISD::CTTZ,       MVT::v32i8,   {  5,  7, 11, 18 } },
-    { ISD::SADDSAT,    MVT::v16i16,  {  1 } },
-    { ISD::SADDSAT,    MVT::v32i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   {  4, 13,  8, 11 } },
+    { ISD::SADDSAT,    MVT::v4i64,   {  3, 10,  8, 12 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  2,  6,  7,  9 } },
+    { ISD::SADDSAT,    MVT::v8i32,   {  4,  6,  7, 13 } },
+    { ISD::SADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::SADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::SMAX,       MVT::v2i64,   {  2,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v4i64,   {  2,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
@@ -3809,11 +3828,17 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
     { ISD::SMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
     { ISD::SMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
-    { ISD::SSUBSAT,    MVT::v16i16,  {  1 } },
-    { ISD::SSUBSAT,    MVT::v32i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v16i16,  {  1 } },
-    { ISD::UADDSAT,    MVT::v32i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v8i32,   {  3 } }, // not + pminud + paddd
+    { ISD::SSUBSAT,    MVT::v2i64,   {  4, 13,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   {  4, 15,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  3, 14,  9, 11 } },
+    { ISD::SSUBSAT,    MVT::v8i32,   {  4, 15,  9, 16 } },
+    { ISD::SSUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::SSUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  2,  8,  6,  6 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  3,  8,  6, 10 } },
+    { ISD::UADDSAT,    MVT::v8i32,   {  2,  2,  4,  8 } },
+    { ISD::UADDSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::UADDSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::UMAX,       MVT::v2i64,   {  2,  8,  5,  6 } },
     { ISD::UMAX,       MVT::v4i64,   {  2,  8,  5,  8 } },
     { ISD::UMAX,       MVT::v8i32,   {  1,  1,  1,  2 } },
@@ -3824,9 +3849,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v8i32,   {  1,  1,  1,  2 } },
     { ISD::UMIN,       MVT::v16i16,  {  1,  1,  1,  2 } },
     { ISD::UMIN,       MVT::v32i8,   {  1,  1,  1,  2 } },
-    { ISD::USUBSAT,    MVT::v16i16,  {  1 } },
-    { ISD::USUBSAT,    MVT::v32i8,   {  1 } },
-    { ISD::USUBSAT,    MVT::v8i32,   {  2 } }, // pmaxud + psubd
+    { ISD::USUBSAT,    MVT::v2i64,   {  2,  7,  6,  6 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  3,  7,  6, 10 } },
+    { ISD::USUBSAT,    MVT::v8i32,   {  2,  2,  2,  4 } },
+    { ISD::USUBSAT,    MVT::v16i16,  {  1,  1,  1,  2 } },
+    { ISD::USUBSAT,    MVT::v32i8,   {  1,  1,  1,  2 } },
     { ISD::FMAXNUM,    MVT::f32,     {  2,  7,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  2,  7,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v8f32,   {  3,  7,  3,  6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
@@ -3883,8 +3910,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v8i16,   {  9, 21, 14, 18 } },
     { ISD::CTTZ,       MVT::v32i8,   { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
     { ISD::CTTZ,       MVT::v16i8,   {  8, 16, 11, 15 } },
-    { ISD::SADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v2i64,   {  6, 13,  8, 11 } },
+    { ISD::SADDSAT,    MVT::v4i64,   { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v8i32,   { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMAX,       MVT::v4i64,   {  6,  9,  6, 12 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  4 } },
     { ISD::SMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
@@ -3895,11 +3925,16 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::SMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SSUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::SSUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::UADDSAT,    MVT::v8i32,   {  8 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v2i64,   {  7, 13,  9, 13 } },
+    { ISD::SSUBSAT,    MVT::v4i64,   { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v8i32,   { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::SSUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v2i64,   {  3,  8,  6,  6 } },
+    { ISD::UADDSAT,    MVT::v4i64,   {  8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v8i32,   {  6,  6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::UADDSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMAX,       MVT::v4i64,   {  9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMAX,       MVT::v2i64,   {  4,  8,  5,  7 } },
     { ISD::UMAX,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
@@ -3910,9 +3945,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v8i32,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMIN,       MVT::v16i16,  {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::UMIN,       MVT::v32i8,   {  4,  6,  5,  6 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v16i16,  {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v32i8,   {  4 } }, // 2 x 128-bit Op + extract/insert
-    { ISD::USUBSAT,    MVT::v8i32,   {  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v2i64,   {  3,  7,  6,  6 } },
+    { ISD::USUBSAT,    MVT::v4i64,   {  8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v8i32,   {  4,  4,  7,  8 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v8i32,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v16i16,  {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
+    { ISD::USUBSAT,    MVT::v32i8,   {  3,  3,  5,  6 } }, // 2 x 128-bit Op + extract/insert
     { ISD::FMAXNUM,    MVT::f32,     {  3,  6,  3,  5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  3,  6,  3,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v8f32,   {  5,  7,  3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
@@ -3963,8 +4001,6 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::FSQRT,      MVT::v2f64,   { 70, 71, 1, 5 } }, // sqrtpd
   };
   static const CostKindTblEntry SSE42CostTbl[] = {
-    { ISD::USUBSAT,    MVT::v4i32,   {  2 } }, // pmaxud + psubd
-    { ISD::UADDSAT,    MVT::v4i32,   {  3 } }, // not + pminud + paddd
     { ISD::FMAXNUM,    MVT::f32,     {  5,  5,  7,  7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
     { ISD::FMAXNUM,    MVT::v4f32,   {  4,  4,  4,  5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
@@ -3974,12 +4010,20 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   };
   static const CostKindTblEntry SSE41CostTbl[] = {
     { ISD::ABS,        MVT::v2i64,   {  3,  4,  3,  5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
+    { ISD::SADDSAT,    MVT::v2i64,   { 10, 14, 17, 21 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  5, 11,  8, 10 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   { 12, 19, 25, 29 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 10, 12 } },
     { ISD::SMAX,       MVT::v2i64,   {  3,  7,  2,  3 } },
     { ISD::SMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::SMAX,       MVT::v16i8,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v2i64,   {  3,  7,  2,  3 } },
     { ISD::SMIN,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  6, 13, 14, 14 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  2,  2,  4,  4 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  6, 10, 14, 14 } },
+    { ISD::USUBSAT,    MVT::v4i32,   {  1,  2,  2,  2 } },
     { ISD::UMAX,       MVT::v2i64,   {  2, 11,  6,  7 } },
     { ISD::UMAX,       MVT::v4i32,   {  1,  1,  1,  1 } },
     { ISD::UMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
@@ -4035,8 +4079,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::v4i32,   { 18, 31, 24, 26 } },
     { ISD::CTTZ,       MVT::v8i16,   { 16, 27, 21, 23 } },
     { ISD::CTTZ,       MVT::v16i8,   { 13, 23, 17, 19 } },
-    { ISD::SADDSAT,    MVT::v8i16,   {  1 } },
-    { ISD::SADDSAT,    MVT::v16i8,   {  1 } },
+    { ISD::SADDSAT,    MVT::v2i64,   { 12, 14, 24, 24 } },
+    { ISD::SADDSAT,    MVT::v4i32,   {  6, 11, 11, 12 } },
+    { ISD::SADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::SADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::SMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
     { ISD::SMAX,       MVT::v4i32,   {  2,  4,  5,  5 } },
     { ISD::SMAX,       MVT::v8i16,   {  1,  1,  1,  1 } },
@@ -4045,10 +4091,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::v4i32,   {  2,  4,  5,  5 } },
     { ISD::SMIN,       MVT::v8i16,   {  1,  1,  1,  1 } },
     { ISD::SMIN,       MVT::v16i8,   {  2,  4,  5,  5 } },
-    { ISD::SSUBSAT,    MVT::v8i16,   {  1 } },
-    { ISD::SSUBSAT,    MVT::v16i8,   {  1 } },
-    { ISD::UADDSAT,    MVT::v8i16,   {  1 } },
-    { ISD::UADDSAT,    MVT::v16i8,   {  1 } },
+    { ISD::SSUBSAT,    MVT::v2i64,   { 16, 19, 31, 31 } },
+    { ISD::SSUBSAT,    MVT::v4i32,   {  6, 14, 12, 13 } },
+    { ISD::SSUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::SSUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v2i64,   {  7, 13, 14, 14 } },
+    { ISD::UADDSAT,    MVT::v4i32,   {  4,  5,  7,  7 } },
+    { ISD::UADDSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::UADDSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::UMAX,       MVT::v2i64,   {  4,  8, 15, 15 } },
     { ISD::UMAX,       MVT::v4i32,   {  2,  5,  8,  8 } },
     { ISD::UMAX,       MVT::v8i16,   {  1,  3,  3,  3 } },
@@ -4057,8 +4107,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::v4i32,   {  2,  5,  8,  8 } },
     { ISD::UMIN,       MVT::v8i16,   {  1,  3,  3,  3 } },
     { ISD::UMIN,       MVT::v16i8,   {  1,  1,  1,  1 } },
-    { ISD::USUBSAT,    MVT::v8i16,   {  1 } },
-    { ISD::USUBSAT,    MVT::v16i8,   {  1 } },
+    { ISD::USUBSAT,    MVT::v2i64,   {  7, 10, 14, 14 } },
+    { ISD::USUBSAT,    MVT::v4i32,   {  4,  4,  7,  7 } },
+    { ISD::USUBSAT,    MVT::v8i16,   {  1,  2,  1,  1 } },
+    { ISD::USUBSAT,    MVT::v16i8,   {  1,  2,  1,  1 } },
     { ISD::FMAXNUM,    MVT::f64,     {  5,  5,  7,  7 } },
     { ISD::FMAXNUM,    MVT::v2f64,   {  4,  6,  6,  6 } },
     { ISD::FSQRT,      MVT::f64,     { 32, 32,  1,  1 } }, // Nehalem from http://www.agner.org/
@@ -4103,10 +4155,14 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTTZ,       MVT::i64,     {  3 } }, // TEST+BSF+CMOV/BRANCH
     { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{  1,  1,  1,  1 } }, // BSR
     { ISD::CTPOP,      MVT::i64,     { 10,  6, 19, 19 } },
-    { ISD::ROTL,       MVT::i64,     {  2, 3, 1, 3 } },
-    { ISD::ROTR,       MVT::i64,     {  2, 3, 1, 3 } },
-    { X86ISD::VROTLI,  MVT::i64,     {  1, 1, 1, 1 } },
-    { ISD::FSHL,       MVT::i64,     {  4, 4, 1, 4 } },
+    { ISD::ROTL,       MVT::i64,     {  2,  3,  1,  3 } },
+    { ISD::ROTR,       MVT::i64,     {  2,  3,  1,  3 } },
+    { X86ISD::VROTLI,  MVT::i64,     {  1,  1,  1,  1 } },
+    { ISD::FSHL,       MVT::i64,     {  4,  4,  1,  4 } },
+    { ISD::SADDSAT,    MVT::i64,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i64,     {  4,  5,  8, 11 } },
+    { ISD::UADDSAT,    MVT::i64,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i64,     {  2,  3,  4,  7 } },
     { ISD::SMAX,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
@@ -4151,6 +4207,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::FSHL,       MVT::i32,     {  4,  4,  1,  4 } },
     { ISD::FSHL,       MVT::i16,     {  4,  4,  2,  5 } },
     { ISD::FSHL,       MVT::i8,      {  4,  4,  2,  5 } },
+    { ISD::SADDSAT,    MVT::i32,     {  3,  4,  6,  9 } },
+    { ISD::SADDSAT,    MVT::i16,     {  4,  4,  7, 10 } },
+    { ISD::SADDSAT,    MVT::i8,      {  4,  5,  8, 11 } },
+    { ISD::SSUBSAT,    MVT::i32,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i16,     {  4,  4,  7, 10 } },
+    { ISD::SSUBSAT,    MVT::i8,      {  4,  5,  8, 11 } },
+    { ISD::UADDSAT,    MVT::i32,     {  2,  3,  4,  7 } },
+    { ISD::UADDSAT,    MVT::i16,     {  2,  3,  4,  7 } },
+    { ISD::UADDSAT,    MVT::i8,      {  3,  3,  5,  8 } },
+    { ISD::USUBSAT,    MVT::i32,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i16,     {  2,  3,  4,  7 } },
+    { ISD::USUBSAT,    MVT::i8,      {  3,  3,  5,  8 } },
     { ISD::SMAX,       MVT::i32,     {  1,  2,  2,  3 } },
     { ISD::SMAX,       MVT::i16,     {  1,  4,  2,  4 } },
     { ISD::SMAX,       MVT::i8,      {  1,  4,  2,  4 } },
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
index 21c99555c5df3..e3ade3986398d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-codesize.ll
@@ -35,211 +35,211 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,103 +287,103 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -391,18 +391,18 @@ define i32 @sub(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -410,18 +410,18 @@ define i32 @sub(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
@@ -429,70 +429,70 @@ define i32 @sub(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
index 1d398d9555fbf..a08785189a4b8 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-latency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
index 4d1fa7c5cf55c..070c70e1fe018 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat-sizelatency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-ssat.ll b/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
index 4bbd28edb434b..e8c8666617b83 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-ssat.ll
@@ -35,13 +35,13 @@ declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -54,13 +54,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -73,13 +73,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -92,32 +92,32 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -130,13 +130,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -149,13 +149,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -168,13 +168,13 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -187,13 +187,13 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -206,13 +206,13 @@ define i32 @add(i32 %arg) {
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -225,21 +225,21 @@ define i32 @add(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -288,13 +288,13 @@ declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -307,13 +307,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'sub'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -326,13 +326,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSE42-LABEL: 'sub'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -345,32 +345,32 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -383,13 +383,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -402,13 +402,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -421,13 +421,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -440,13 +440,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -459,13 +459,13 @@ define i32 @sub(i32 %arg) {
 ;
 ; GLM-LABEL: 'sub'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -478,21 +478,21 @@ define i32 @sub(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
index e92b8102a5a3f..fcb03da681e2a 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-codesize.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
index 9d9e5843c2285..b5a1321531f5c 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-latency.ll
@@ -35,154 +35,154 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
@@ -191,55 +191,55 @@ define i32 @add(i32 %arg) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -292,17 +292,17 @@ define i32 @sub(i32 %arg) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
@@ -311,188 +311,188 @@ define i32 @sub(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
index 1912822bdcb65..c4b0f7528cb1d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat-sizelatency.ll
@@ -34,212 +34,212 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'add'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'add'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'add'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'add'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'add'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'add'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'add'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'add'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'add'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -287,212 +287,212 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-usat.ll b/llvm/test/Analysis/CostModel/X86/arith-usat.ll
index 75950e40c2a00..95dc22d4a1714 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-usat.ll
@@ -35,18 +35,18 @@ declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @add(i32 %arg) {
 ; SSE2-LABEL: 'add'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -54,18 +54,18 @@ define i32 @add(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'add'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -77,14 +77,14 @@ define i32 @add(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -92,37 +92,37 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX1-LABEL: 'add'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'add'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -130,18 +130,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'add'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -149,18 +149,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'add'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -168,18 +168,18 @@ define i32 @add(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'add'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -187,18 +187,18 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -210,14 +210,14 @@ define i32 @add(i32 %arg) {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -225,21 +225,21 @@ define i32 @add(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'add'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -288,18 +288,18 @@ declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 define i32 @sub(i32 %arg) {
 ; SSE2-LABEL: 'sub'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -307,18 +307,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; SSSE3-LABEL: 'sub'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -330,14 +330,14 @@ define i32 @sub(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -345,37 +345,37 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX1-LABEL: 'sub'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sub'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -383,18 +383,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512F-LABEL: 'sub'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -402,18 +402,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512BW-LABEL: 'sub'
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -421,18 +421,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; AVX512DQ-LABEL: 'sub'
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -440,18 +440,18 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -463,14 +463,14 @@ define i32 @sub(i32 %arg) {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -478,21 +478,21 @@ define i32 @sub(i32 %arg) {
 ;
 ; BTVER2-LABEL: 'sub'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 09c585200a598..634870e3c780a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -26,70 +26,70 @@ declare i16 @llvm.sadd.sat.i16(i16, i16)
 declare i8  @llvm.sadd.sat.i8 (i8 , i8 )
 
 define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @add_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
 ;
-; AVX1-LABEL: @add_v8i64(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
-; AVX1-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; AVX1-NEXT:    ret void
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    ret void
 ;
-; AVX2-LABEL: @add_v8i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
-; AVX2-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @add_v8i64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
+; AVX-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @add_v8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
@@ -714,5 +714,5 @@ define void @add_v64i8() {
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SLM: {{.*}}
-; SSE2: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index 93cce8b563ba5..bd10ecdf330ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -26,40 +26,59 @@ declare i16 @llvm.uadd.sat.i16(i16, i16)
 declare i8  @llvm.uadd.sat.i8 (i8 , i8 )
 
 define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @add_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
+;
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
@@ -684,6 +703,3 @@ define void @add_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SLM: {{.*}}
-; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
index 478751e6a2575..35d46e4c85be0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -61,35 +61,16 @@ define void @sub_v8i64() {
 ; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX1-LABEL: @sub_v8i64(
-; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
-; AVX1-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @sub_v8i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
-; AVX2-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
+; AVX-NEXT:    store <4 x i64> [[TMP3]], ptr @c64, align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @sub_v8i64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
@@ -714,5 +695,7 @@ define void @sub_v64i8() {
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1: {{.*}}
+; AVX2: {{.*}}
 ; SLM: {{.*}}
 ; SSE2: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
index bfe989be8ca10..7777bf433136e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -26,40 +26,59 @@ declare i16 @llvm.usub.sat.i16(i16, i16)
 declare i8  @llvm.usub.sat.i8 (i8 , i8 )
 
 define void @sub_v8i64() {
-; SSE-LABEL: @sub_v8i64(
-; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT:    store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT:    ret void
+; SSE2-LABEL: @sub_v8i64(
+; SSE2-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
+; SSE2-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
+; SSE2-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; SSE2-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SSE2-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; SSE2-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SSE2-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; SSE2-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SSE2-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; SSE2-NEXT:    [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE2-NEXT:    [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE2-NEXT:    [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE2-NEXT:    [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE2-NEXT:    [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE2-NEXT:    [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE2-NEXT:    [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE2-NEXT:    [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE2-NEXT:    store i64 [[R0]], ptr @c64, align 8
+; SSE2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; SSE2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SSE2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; SSE2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SSE2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; SSE2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SSE2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; SSE2-NEXT:    ret void
+;
+; SLM-LABEL: @sub_v8i64(
+; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
+; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; SLM-NEXT:    store <2 x i64> [[TMP3]], ptr @c64, align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    store <2 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
@@ -684,6 +703,3 @@ define void @sub_v64i8() {
   store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SLM: {{.*}}
-; SSE2: {{.*}}

>From a79db96ec0decca4fe45579e039cf5589345b3ed Mon Sep 17 00:00:00 2001
From: Kai Nacke <kai.peter.nacke at ibm.com>
Date: Thu, 25 Jul 2024 09:41:55 -0400
Subject: [PATCH 141/259] [GISel][TableGen] Generate getRegBankFromRegClass
 (#99896)

Generating the mapping from a register class to a register bank is
complex:
- there can be lots of register classes
- the mapping may be ambiguous
- a register class can span several register banks (e.g. a register
class containing all registers)
- the type information is not enough to decide which register bank to
map to (e.g. a register class containing floating point and vector
registers, where all registers can represent an f64 value)

The approach taken here is to encode the register banks in an array
indexed by the ID of the register class. To save space, the entries are
packed into chunks of size 2^n.
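
For illustration, a minimal sketch of such a packed lookup table (hypothetical
code, not the output of the TableGen emitter; the names RegBankByRCID and
getRegBankIDFromRCID are made up, and 4 banks / 2 bits per entry is an assumed
configuration):

  // Hypothetical C++ sketch: pack 2-bit register bank IDs into bytes,
  // indexed by register class ID.
  #include <cassert>
  #include <cstdint>

  constexpr unsigned BitsPerEntry = 2;              // assumes at most 4 banks
  constexpr unsigned EntriesPerChunk = 8 / BitsPerEntry;

  // Each byte packs the bank IDs of four consecutive register classes
  // (the values here are made up for illustration).
  constexpr uint8_t RegBankByRCID[] = {
      0b11'00'01'10, // register classes 0..3
      0b00'00'10'01, // register classes 4..7
  };

  unsigned getRegBankIDFromRCID(unsigned RCID) {
    assert(RCID / EntriesPerChunk < sizeof(RegBankByRCID) &&
           "register class ID out of range");
    uint8_t Chunk = RegBankByRCID[RCID / EntriesPerChunk];
    unsigned Shift = (RCID % EntriesPerChunk) * BitsPerEntry;
    return (Chunk >> Shift) & ((1u << BitsPerEntry) - 1);
  }

The generated code may of course lay the table out differently; the sketch only
illustrates the indexing and packing scheme described above.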
---
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 49 +---------
 .../AArch64/GISel/AArch64RegisterBankInfo.h   |  2 +-
 llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp   | 38 --------
 llvm/lib/Target/ARM/ARMRegisterBankInfo.h     |  3 -
 .../M68k/GISel/M68kRegisterBankInfo.cpp       |  6 --
 .../Target/M68k/GISel/M68kRegisterBankInfo.h  |  3 -
 llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp | 29 ------
 llvm/lib/Target/Mips/MipsRegisterBankInfo.h   |  3 -
 .../PowerPC/GISel/PPCRegisterBankInfo.cpp     | 20 +---
 .../PowerPC/GISel/PPCRegisterBankInfo.h       |  1 +
 .../RISCV/GISel/RISCVRegisterBankInfo.cpp     | 45 ---------
 .../RISCV/GISel/RISCVRegisterBankInfo.h       |  3 -
 .../Target/SPIRV/SPIRVRegisterBankInfo.cpp    |  1 +
 .../Target/X86/GISel/X86RegisterBankInfo.cpp  | 27 ------
 .../Target/X86/GISel/X86RegisterBankInfo.h    |  3 -
 llvm/test/TableGen/RegBankFromRegClass.td     | 45 +++++++++
 llvm/utils/TableGen/RegisterBankEmitter.cpp   | 94 ++++++++++++++++++-
 17 files changed, 143 insertions(+), 229 deletions(-)
 create mode 100644 llvm/test/TableGen/RegBankFromRegClass.td

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 220ddbf10a1c2..23e135063147a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -241,57 +241,12 @@ unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
 
 const RegisterBank &
 AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                                LLT) const {
+                                                LLT Ty) const {
   switch (RC.getID()) {
-  case AArch64::FPR8RegClassID:
-  case AArch64::FPR16RegClassID:
-  case AArch64::FPR16_loRegClassID:
-  case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID:
-  case AArch64::FPR32RegClassID:
-  case AArch64::FPR64RegClassID:
-  case AArch64::FPR128RegClassID:
-  case AArch64::FPR64_loRegClassID:
-  case AArch64::FPR128_loRegClassID:
-  case AArch64::FPR128_0to7RegClassID:
-  case AArch64::DDRegClassID:
-  case AArch64::DDDRegClassID:
-  case AArch64::DDDDRegClassID:
-  case AArch64::QQRegClassID:
-  case AArch64::QQQRegClassID:
-  case AArch64::QQQQRegClassID:
-  case AArch64::ZPRRegClassID:
-  case AArch64::ZPR_3bRegClassID:
-    return getRegBank(AArch64::FPRRegBankID);
-  case AArch64::GPR32commonRegClassID:
-  case AArch64::GPR32RegClassID:
-  case AArch64::GPR32spRegClassID:
-  case AArch64::GPR32sponlyRegClassID:
-  case AArch64::GPR32argRegClassID:
-  case AArch64::GPR32allRegClassID:
-  case AArch64::GPR64commonRegClassID:
-  case AArch64::GPR64RegClassID:
-  case AArch64::GPR64spRegClassID:
   case AArch64::GPR64sponlyRegClassID:
-  case AArch64::GPR64argRegClassID:
-  case AArch64::GPR64allRegClassID:
-  case AArch64::GPR64noipRegClassID:
-  case AArch64::GPR64common_and_GPR64noipRegClassID:
-  case AArch64::GPR64noip_and_tcGPR64RegClassID:
-  case AArch64::tcGPR64RegClassID:
-  case AArch64::tcGPRx16x17RegClassID:
-  case AArch64::tcGPRx17RegClassID:
-  case AArch64::tcGPRnotx16RegClassID:
-  case AArch64::WSeqPairsClassRegClassID:
-  case AArch64::XSeqPairsClassRegClassID:
-  case AArch64::MatrixIndexGPR32_8_11RegClassID:
-  case AArch64::MatrixIndexGPR32_12_15RegClassID:
-  case AArch64::GPR64_with_sub_32_in_MatrixIndexGPR32_8_11RegClassID:
-  case AArch64::GPR64_with_sub_32_in_MatrixIndexGPR32_12_15RegClassID:
     return getRegBank(AArch64::GPRRegBankID);
-  case AArch64::CCRRegClassID:
-    return getRegBank(AArch64::CCRegBankID);
   default:
-    llvm_unreachable("Register class not supported");
+    return AArch64GenRegisterBankInfo::getRegBankFromRegClass(RC, Ty);
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 0d89f540650a9..941499b08d05d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -150,7 +150,7 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
                     TypeSize Size) const override;
 
   const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
+                                             LLT Ty) const override;
 
   InstructionMappings
   getInstrAlternativeMappings(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 9234881c9407e..447db18b8defa 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -170,44 +170,6 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) {
   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
 }
 
-const RegisterBank &
-ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                            LLT) const {
-  using namespace ARM;
-
-  switch (RC.getID()) {
-  case GPRRegClassID:
-  case GPRwithAPSRRegClassID:
-  case GPRnoipRegClassID:
-  case GPRnopcRegClassID:
-  case GPRnoip_and_GPRnopcRegClassID:
-  case rGPRRegClassID:
-  case GPRspRegClassID:
-  case tcGPRRegClassID:
-  case tcGPRnotr12RegClassID:
-  case tGPRRegClassID:
-  case tGPREvenRegClassID:
-  case tGPROddRegClassID:
-  case tGPR_and_tGPREvenRegClassID:
-  case tGPR_and_tGPROddRegClassID:
-  case tGPREven_and_tcGPRRegClassID:
-  case tGPROdd_and_tcGPRRegClassID:
-  case tGPREven_and_tcGPRnotr12RegClassID:
-    return getRegBank(ARM::GPRRegBankID);
-  case HPRRegClassID:
-  case SPR_8RegClassID:
-  case SPRRegClassID:
-  case DPR_8RegClassID:
-  case DPRRegClassID:
-  case QPRRegClassID:
-    return getRegBank(ARM::FPRRegBankID);
-  default:
-    llvm_unreachable("Unsupported register kind");
-  }
-
-  llvm_unreachable("Switch should handle all register classes");
-}
-
 const RegisterBankInfo::InstructionMapping &
 ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   auto Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
index c56134aab38c6..2694174623c5c 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -32,9 +32,6 @@ class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
 public:
   ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 };
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
index e7e5bb19c3a07..16d009ab81adc 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
@@ -58,12 +58,6 @@ const RegisterBankInfo::ValueMapping ValueMappings[] = {
 M68kRegisterBankInfo::M68kRegisterBankInfo(const TargetRegisterInfo &TRI)
     : M68kGenRegisterBankInfo() {}
 
-const RegisterBank &
-M68kRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const {
-  return getRegBank(M68k::GPRRegBankID);
-}
-
 const RegisterBankInfo::InstructionMapping &
 M68kRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   auto Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
index 493c139f018cd..6122db8b48989 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
@@ -35,9 +35,6 @@ class M68kRegisterBankInfo final : public M68kGenRegisterBankInfo {
 public:
   M68kRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 };
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 62b58cba9f24a..4aecaf18db480 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -75,35 +75,6 @@ using namespace llvm;
 
 MipsRegisterBankInfo::MipsRegisterBankInfo(const TargetRegisterInfo &TRI) {}
 
-const RegisterBank &
-MipsRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const {
-  using namespace Mips;
-
-  switch (RC.getID()) {
-  case Mips::GPR32RegClassID:
-  case Mips::CPU16Regs_and_GPRMM16ZeroRegClassID:
-  case Mips::GPRMM16MovePPairFirstRegClassID:
-  case Mips::CPU16Regs_and_GPRMM16MovePPairSecondRegClassID:
-  case Mips::GPRMM16MoveP_and_CPU16Regs_and_GPRMM16ZeroRegClassID:
-  case Mips::GPRMM16MovePPairFirst_and_GPRMM16MovePPairSecondRegClassID:
-  case Mips::SP32RegClassID:
-  case Mips::GP32RegClassID:
-    return getRegBank(Mips::GPRBRegBankID);
-  case Mips::FGRCCRegClassID:
-  case Mips::FGR32RegClassID:
-  case Mips::FGR64RegClassID:
-  case Mips::AFGR64RegClassID:
-  case Mips::MSA128BRegClassID:
-  case Mips::MSA128HRegClassID:
-  case Mips::MSA128WRegClassID:
-  case Mips::MSA128DRegClassID:
-    return getRegBank(Mips::FPRBRegBankID);
-  default:
-    llvm_unreachable("Register class not supported");
-  }
-}
-
 // Instructions where use operands are floating point registers.
 // Def operands are general purpose.
 static bool isFloatingPointOpcodeUse(unsigned Opc) {
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
index bc424b93f6056..1e07962fd02b3 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -32,9 +32,6 @@ class MipsRegisterBankInfo final : public MipsGenRegisterBankInfo {
 public:
   MipsRegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
index 125a49de7b27d..4a004f8960562 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
@@ -34,13 +34,6 @@ const RegisterBank &
 PPCRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                             LLT Ty) const {
   switch (RC.getID()) {
-  case PPC::G8RCRegClassID:
-  case PPC::G8RC_NOX0RegClassID:
-  case PPC::G8RC_and_G8RC_NOX0RegClassID:
-  case PPC::GPRCRegClassID:
-  case PPC::GPRC_NOR0RegClassID:
-  case PPC::GPRC_and_GPRC_NOR0RegClassID:
-    return getRegBank(PPC::GPRRegBankID);
   case PPC::VSFRCRegClassID:
   case PPC::SPILLTOVSRRC_and_VSFRCRegClassID:
   case PPC::SPILLTOVSRRC_and_VFRCRegClassID:
@@ -50,19 +43,8 @@ PPCRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case PPC::VSSRCRegClassID:
   case PPC::F4RCRegClassID:
     return getRegBank(PPC::FPRRegBankID);
-  case PPC::VSRCRegClassID:
-  case PPC::VRRCRegClassID:
-  case PPC::VRRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-  case PPC::VSRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-  case PPC::SPILLTOVSRRCRegClassID:
-  case PPC::VSLRCRegClassID:
-  case PPC::VSLRC_with_sub_64_in_SPILLTOVSRRCRegClassID:
-    return getRegBank(PPC::VECRegBankID);
-  case PPC::CRRCRegClassID:
-  case PPC::CRBITRCRegClassID:
-    return getRegBank(PPC::CRRegBankID);
   default:
-    llvm_unreachable("Unexpected register class");
+    return PPCGenRegisterBankInfo::getRegBankFromRegClass(RC, Ty);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
index 1477fdca917d7..332a34fe022dd 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
+++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
@@ -67,6 +67,7 @@ class PPCRegisterBankInfo final : public PPCGenRegisterBankInfo {
 
   const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
                                              LLT Ty) const override;
+
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index d25e96525399e..43bbc8589e7e2 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -112,51 +112,6 @@ using namespace llvm;
 RISCVRegisterBankInfo::RISCVRegisterBankInfo(unsigned HwMode)
     : RISCVGenRegisterBankInfo(HwMode) {}
 
-const RegisterBank &
-RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                              LLT Ty) const {
-  switch (RC.getID()) {
-  default:
-    llvm_unreachable("Register class not supported");
-  case RISCV::GPRRegClassID:
-  case RISCV::GPRF16RegClassID:
-  case RISCV::GPRF32RegClassID:
-  case RISCV::GPRNoX0RegClassID:
-  case RISCV::GPRNoX0X2RegClassID:
-  case RISCV::GPRJALRRegClassID:
-  case RISCV::GPRJALRNonX7RegClassID:
-  case RISCV::GPRTCRegClassID:
-  case RISCV::GPRTCNonX7RegClassID:
-  case RISCV::GPRC_and_GPRTCRegClassID:
-  case RISCV::GPRCRegClassID:
-  case RISCV::GPRC_and_SR07RegClassID:
-  case RISCV::SR07RegClassID:
-  case RISCV::SPRegClassID:
-  case RISCV::GPRX0RegClassID:
-    return getRegBank(RISCV::GPRBRegBankID);
-  case RISCV::FPR64RegClassID:
-  case RISCV::FPR16RegClassID:
-  case RISCV::FPR32RegClassID:
-  case RISCV::FPR64CRegClassID:
-  case RISCV::FPR32CRegClassID:
-    return getRegBank(RISCV::FPRBRegBankID);
-  case RISCV::VMRegClassID:
-  case RISCV::VRRegClassID:
-  case RISCV::VRNoV0RegClassID:
-  case RISCV::VRM2RegClassID:
-  case RISCV::VRM2NoV0RegClassID:
-  case RISCV::VRM4RegClassID:
-  case RISCV::VRM4NoV0RegClassID:
-  case RISCV::VMV0RegClassID:
-  case RISCV::VRM2_with_sub_vrm1_0_in_VMV0RegClassID:
-  case RISCV::VRM4_with_sub_vrm1_0_in_VMV0RegClassID:
-  case RISCV::VRM8RegClassID:
-  case RISCV::VRM8NoV0RegClassID:
-  case RISCV::VRM8_with_sub_vrm1_0_in_VMV0RegClassID:
-    return getRegBank(RISCV::VRBRegBankID);
-  }
-}
-
 static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) {
   unsigned Idx;
   switch (Size) {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
index abd0837395f66..79dddb73a2373 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.h
@@ -33,9 +33,6 @@ class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo {
 public:
   RISCVRegisterBankInfo(unsigned HwMode);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT Ty) const override;
-
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
index ecd99f1840d7e..0ed45466788a0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "SPIRVRegisterBankInfo.h"
 #include "SPIRVRegisterInfo.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/RegisterBank.h"
 
 #define GET_REGINFO_ENUM
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index 9e85424e76e62..61633a09d93cf 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -44,33 +44,6 @@ X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) {
          "GPRs should hold up to 64-bit");
 }
 
-const RegisterBank &
-X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                            LLT) const {
-
-  if (X86::GR8RegClass.hasSubClassEq(&RC) ||
-      X86::GR16RegClass.hasSubClassEq(&RC) ||
-      X86::GR32RegClass.hasSubClassEq(&RC) ||
-      X86::GR64RegClass.hasSubClassEq(&RC) ||
-      X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) ||
-      X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::GPRRegBankID);
-
-  if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
-      X86::FR64XRegClass.hasSubClassEq(&RC) ||
-      X86::VR128XRegClass.hasSubClassEq(&RC) ||
-      X86::VR256XRegClass.hasSubClassEq(&RC) ||
-      X86::VR512RegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::VECRRegBankID);
-
-  if (X86::RFP80RegClass.hasSubClassEq(&RC) ||
-      X86::RFP32RegClass.hasSubClassEq(&RC) ||
-      X86::RFP64RegClass.hasSubClassEq(&RC))
-    return getRegBank(X86::PSRRegBankID);
-
-  llvm_unreachable("Unsupported register kind yet.");
-}
-
 // \returns true if a given intrinsic only uses and defines FPRs.
 static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
                           const MachineInstr &MI) {
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
index 8f38e717e36b0..f30e9fea1a597 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h
@@ -81,9 +81,6 @@ class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
 public:
   X86RegisterBankInfo(const TargetRegisterInfo &TRI);
 
-  const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
-                                             LLT) const override;
-
   InstructionMappings
   getInstrAlternativeMappings(const MachineInstr &MI) const override;
 
diff --git a/llvm/test/TableGen/RegBankFromRegClass.td b/llvm/test/TableGen/RegBankFromRegClass.td
new file mode 100644
index 0000000000000..072a436035052
--- /dev/null
+++ b/llvm/test/TableGen/RegBankFromRegClass.td
@@ -0,0 +1,45 @@
+// RUN: llvm-tblgen -gen-register-bank -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def MyTarget : Target;
+
+def R0 : Register<"r0">;
+def GR : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
+
+def F0 : Register<"f0">;
+def FR : RegisterClass<"MyTarget", [f32], 32, (add F0)>;
+
+def V0 : Register<"V0">;
+def VR : RegisterClass<"MyTarget", [v4i8, f32], 32, (add V0)>;
+
+def AllFloatR : RegisterClass<"MyTarget", [f32], 32, (add F0, V0)>;
+def AnyR : RegisterClass<"MyTarget", [i32, f32, v4i8], 32, (add R0, F0, V0)>;
+
+def GRRegBank : RegisterBank<"GRB", [GR]>;
+def FRRegBank : RegisterBank<"FRB", [FR]>;
+def VRRegBank : RegisterBank<"VRB", [VR]>;
+
+
+// CHECK:      #ifdef GET_TARGET_REGBANK_CLASS
+// CHECK:        const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const override;
+
+// CHECK:      #ifdef GET_TARGET_REGBANK_IMPL
+// CHECK:      const RegisterBank &
+// CHECK-NEXT: MyTargetGenRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const {
+// CHECK-NEXT:   constexpr uint32_t InvalidRegBankID = uint32_t(MyTarget::InvalidRegBankID) & 3;
+// CHECK-NEXT:   static const uint32_t RegClass2RegBank[1] = {
+// CHECK-NEXT:     (uint32_t(InvalidRegBankID) << 0) |
+// CHECK-NEXT:     (uint32_t(InvalidRegBankID) << 2) |
+// CHECK-NEXT:     (uint32_t(MyTarget::FRRegBankID) << 4) | // FRRegClassID
+// CHECK-NEXT:     (uint32_t(MyTarget::GRRegBankID) << 6) | // GRRegClassID
+// CHECK-NEXT:     (uint32_t(MyTarget::VRRegBankID) << 8) // VRRegClassID
+// CHECK-NEXT:   };
+// CHECK-NEXT:   const unsigned RegClassID = RC.getID();
+// CHECK-NEXT:   if (LLVM_LIKELY(RegClassID < 5)) {
+// CHECK-NEXT:     unsigned RegBankID = (RegClass2RegBank[RegClassID / 16] >> ((RegClassID % 16) * 2)) & 3;
+// CHECK-NEXT:     if (RegBankID != InvalidRegBankID)
+// CHECK-NEXT:       return getRegBank(RegBankID);
+// CHECK-NEXT:   }
+// CHECK-NEXT:   llvm_unreachable(llvm::Twine("Target needs to handle register class ID 0x").concat(llvm::Twine::utohexstr(RegClassID)).str().c_str());
+// CHECK-NEXT: }
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 5546e727af384..6872f16df4724 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -16,6 +16,7 @@
 #include "Common/InfoByHwMode.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -151,6 +152,9 @@ void RegisterBankEmitter::emitBaseClassDefinition(
   OS << "private:\n"
      << "  static const RegisterBank *RegBanks[];\n"
      << "  static const unsigned Sizes[];\n\n"
+     << "public:\n"
+     << "  const RegisterBank &getRegBankFromRegClass(const "
+        "TargetRegisterClass &RC, LLT Ty) const override;\n"
      << "protected:\n"
      << "  " << TargetName << "GenRegisterBankInfo(unsigned HwMode = 0);\n"
      << "\n";
@@ -287,8 +291,94 @@ void RegisterBankEmitter::emitBaseClassImplementation(
      << "  for (auto RB : enumerate(RegBanks))\n"
      << "    assert(RB.index() == RB.value()->getID() && \"Index != ID\");\n"
      << "#endif // NDEBUG\n"
-     << "}\n"
-     << "} // end namespace llvm\n";
+     << "}\n";
+
+  uint32_t NumRegBanks = Banks.size();
+  uint32_t BitSize = NextPowerOf2(Log2_32(NumRegBanks));
+  uint32_t ElemsPerWord = 32 / BitSize;
+  uint32_t BitMask = (1 << BitSize) - 1;
+  bool HasAmbigousOrMissingEntry = false;
+  struct Entry {
+    std::string RCIdName;
+    std::string RBIdName;
+  };
+  SmallVector<Entry, 0> Entries;
+  for (const auto &Bank : Banks) {
+    for (const auto *RC : Bank.register_classes()) {
+      if (RC->EnumValue >= Entries.size())
+        Entries.resize(RC->EnumValue + 1);
+      Entry &E = Entries[RC->EnumValue];
+      E.RCIdName = RC->getIdName();
+      if (!E.RBIdName.empty()) {
+        HasAmbigousOrMissingEntry = true;
+        E.RBIdName = "InvalidRegBankID";
+      } else {
+        E.RBIdName = (TargetName + "::" + Bank.getEnumeratorName()).str();
+      }
+    }
+  }
+  for (auto &E : Entries) {
+    if (E.RBIdName.empty()) {
+      HasAmbigousOrMissingEntry = true;
+      E.RBIdName = "InvalidRegBankID";
+    }
+  }
+  OS << "const RegisterBank &\n"
+     << TargetName
+     << "GenRegisterBankInfo::getRegBankFromRegClass"
+        "(const TargetRegisterClass &RC, LLT) const {\n";
+  if (HasAmbigousOrMissingEntry) {
+    OS << "  constexpr uint32_t InvalidRegBankID = uint32_t("
+       << TargetName + "::InvalidRegBankID) & " << BitMask << ";\n";
+  }
+  unsigned TableSize =
+      Entries.size() / ElemsPerWord + ((Entries.size() % ElemsPerWord) > 0);
+  OS << "  static const uint32_t RegClass2RegBank[" << TableSize << "] = {\n";
+  uint32_t Shift = 32 - BitSize;
+  bool First = true;
+  std::string TrailingComment;
+  for (auto &E : Entries) {
+    Shift += BitSize;
+    if (Shift == 32) {
+      Shift = 0;
+      if (First)
+        First = false;
+      else
+        OS << ',' << TrailingComment << '\n';
+    } else {
+      OS << " |" << TrailingComment << '\n';
+    }
+    OS << "    ("
+       << (E.RBIdName.empty()
+               ? "InvalidRegBankID"
+               : Twine("uint32_t(").concat(E.RBIdName).concat(")").str())
+       << " << " << Shift << ')';
+    if (!E.RCIdName.empty())
+      TrailingComment = " // " + E.RCIdName;
+    else
+      TrailingComment = "";
+  }
+  OS << TrailingComment
+     << "\n  };\n"
+        "  const unsigned RegClassID = RC.getID();\n"
+        "  if (LLVM_LIKELY(RegClassID < "
+     << Entries.size()
+     << ")) {\n"
+        "    unsigned RegBankID = (RegClass2RegBank[RegClassID / "
+     << ElemsPerWord << "] >> ((RegClassID % " << ElemsPerWord << ") * "
+     << BitSize << ")) & " << BitMask << ";\n";
+  if (HasAmbigousOrMissingEntry) {
+    OS << "    if (RegBankID != InvalidRegBankID)\n"
+          "      return getRegBank(RegBankID);\n";
+  } else
+    OS << "    return getRegBank(RegBankID);\n";
+  OS << "  }\n"
+        "  llvm_unreachable(llvm::Twine(\"Target needs to handle register "
+        "class ID "
+        "0x\").concat(llvm::Twine::utohexstr(RegClassID)).str().c_str());\n"
+        "}\n";
+
+  OS << "} // end namespace llvm\n";
 }
 
 void RegisterBankEmitter::run(raw_ostream &OS) {
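
The TableGen test above pins down the shape of the generated lookup: one small bank ID per register class, packed into 32-bit words, with the entry width being the next power of two of log2(NumRegBanks) (2 bits for the example target). A minimal, self-contained C++ sketch of that packing and lookup follows; the bank and class IDs are illustrative only, not the real enum values.

// Sketch only: mirrors the table generated for the test target above.
#include <cassert>
#include <cstdint>

constexpr uint32_t BitSize = 2;                 // bits per register class entry
constexpr uint32_t ElemsPerWord = 32 / BitSize; // 16 entries per uint32_t
constexpr uint32_t BitMask = (1u << BitSize) - 1;
constexpr uint32_t InvalidRegBankID = 3;        // sentinel value after masking

constexpr uint32_t pack(uint32_t BankID, uint32_t ClassID) {
  return (BankID & BitMask) << ((ClassID % ElemsPerWord) * BitSize);
}

// Classes 0 and 1 have no unique bank; classes 2..4 map to banks 0..2.
static const uint32_t RegClass2RegBank[1] = {
    pack(InvalidRegBankID, 0) | pack(InvalidRegBankID, 1) | pack(0, 2) |
    pack(1, 3) | pack(2, 4)};

uint32_t lookupRegBank(unsigned RegClassID) {
  return (RegClass2RegBank[RegClassID / ElemsPerWord] >>
          ((RegClassID % ElemsPerWord) * BitSize)) &
         BitMask;
}

int main() {
  assert(lookupRegBank(2) == 0 && lookupRegBank(3) == 1 && lookupRegBank(4) == 2);
  assert(lookupRegBank(0) == InvalidRegBankID);
  return 0;
}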

>From d311edd0eff2095b1e5e73b4e612e7ddd8d93d70 Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz at cn.ibm.com>
Date: Thu, 25 Jul 2024 09:21:55 -0400
Subject: [PATCH 142/259] [PowerPC] fix default cpu setting

This handles platforms where getHostCPUName() returns nothing, for example
when targeting ARM on Windows. In that case, -mcpu=native should set the CPU
to the default implied by the triple instead of setting the CPU to "native".

Fixes https://lab.llvm.org/buildbot/#/builders/161/builds/873 caused by
https://github.com/llvm/llvm-project/pull/97541
---
 llvm/lib/TargetParser/PPCTargetParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index c773c326f1841..422d758c772e1 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -97,7 +97,7 @@ StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName) {
     }
 
     StringRef CPU = normalizeCPUName(CPUName);
-    if (CPU != "generic")
+    if (CPU != "generic" && CPU != "native")
       return CPU;
   }
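
A rough sketch of the intended behaviour (kept separate from the real getNormalizedPPCTargetCPU above; defaultCPUForTriple below is a hypothetical stand-in for the triple-based default selection the function falls through to):

#include "llvm/ADT/StringRef.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/TargetParser/Triple.h"

// Hypothetical helper: whatever default CPU the target derives from the triple.
llvm::StringRef defaultCPUForTriple(const llvm::Triple &T);

llvm::StringRef resolveCPU(llvm::StringRef Requested, const llvm::Triple &T) {
  llvm::StringRef CPU =
      Requested == "native" ? llvm::sys::getHostCPUName() : Requested;
  // If host detection yields nothing (or only a generic answer), fall back to
  // the triple-based default rather than passing "native" through.
  if (CPU.empty() || CPU == "generic" || CPU == "native")
    return defaultCPUForTriple(T);
  return CPU;
}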
 

>From 9ad92c476907451bdc95a6c2b6a6bbba03815b4b Mon Sep 17 00:00:00 2001
From: Donát Nagy <donat.nagy at ericsson.com>
Date: Thu, 25 Jul 2024 15:56:17 +0200
Subject: [PATCH 143/259] [analyzer][NFC] Minor cleanup in two test files.
 (#100570)

This commit contains two unrelated trivial changes:

(1) Three unused variables are removed from `ctor.mm`.
(2) A FIXME block is removed from `ctor-array.cpp` because it described
    an issue that has since been resolved.
---
 clang/test/Analysis/ctor-array.cpp | 12 +-----------
 clang/test/Analysis/ctor.mm        |  2 --
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/clang/test/Analysis/ctor-array.cpp b/clang/test/Analysis/ctor-array.cpp
index 053669cc2aada..49412ee5a68c7 100644
--- a/clang/test/Analysis/ctor-array.cpp
+++ b/clang/test/Analysis/ctor-array.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-disable-checker=cplusplus -analyzer-config c++-inlining=constructors -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-config c++-inlining=constructors -verify %s
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -119,16 +119,6 @@ struct s5 {
 };
 
 void g1(void) {
-  // FIXME: This test requires -analyzer-disable-checker=cplusplus,
-  // because of the checker's weird behaviour in case of arrays.
-  // E.g.:
-  //        s3 *arr = new s3[4];
-  //        s3 *arr2 = new (arr + 1) s3[1];
-  //                   ^~~~~~~~~~~~~~~~~~~
-  //                   warning: 12 bytes is possibly not enough
-  //                            for array allocation which requires
-  //                            4 bytes.
-
   s5::c = 0;
   s5 *arr = new s5[4];
   new (arr + 1) s5[3];
diff --git a/clang/test/Analysis/ctor.mm b/clang/test/Analysis/ctor.mm
index fb385833df9c7..6ac9050fc29f7 100644
--- a/clang/test/Analysis/ctor.mm
+++ b/clang/test/Analysis/ctor.mm
@@ -56,8 +56,6 @@ void testNonPODCopyConstructor() {
 namespace ConstructorVirtualCalls {
   class A {
   public:
-    int *out1, *out2, *out3;
-
     virtual int get() { return 1; }
 
     A(int *out1) {

>From 12dba4d48461c571cb5bdb7c3ac6078b9a357a02 Mon Sep 17 00:00:00 2001
From: weiwei chen <weiwei.chen at modular.com>
Date: Thu, 25 Jul 2024 10:01:46 -0400
Subject: [PATCH 144/259] [mlir] Add metadata to Diagnostic. (#99398)

Add metadata to Diagnostic.

Motivation: we have a use case where we want to filter diagnostics in a
customized diagnostic handler based on information that is not carried by the
existing `Diagnostic` members (`location`, `severity`, or the diagnostic
arguments). Specifically, we want to attach a unique ID to a `Diagnostic` so
that the handler can filter on it in a compiler pass that emits errors from
asynchronous, multithreaded tasks, where diagnostic handling is associated
with the task.

This patch adds a `metadata` field to `mlir::Diagnostic` as a general
solution. `metadata` has type `SmallVector<DiagnosticArgument, 0>` to keep the
memory footprint small and to reuse the existing `DiagnosticArgument` type for
metadata entries.
---
 mlir/docs/Diagnostics.md                      | 12 +++-
 mlir/include/mlir/IR/Diagnostics.h            |  6 ++
 mlir/test/IR/diagnostic-handler-metadata.mlir | 19 ++++++
 mlir/test/lib/IR/CMakeLists.txt               |  1 +
 mlir/test/lib/IR/TestDiagnosticsMetadata.cpp  | 65 +++++++++++++++++++
 mlir/tools/mlir-opt/mlir-opt.cpp              |  2 +
 6 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/IR/diagnostic-handler-metadata.mlir
 create mode 100644 mlir/test/lib/IR/TestDiagnosticsMetadata.cpp

diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md
index 82bc61dd8c3ad..a6d59bdecbdd8 100644
--- a/mlir/docs/Diagnostics.md
+++ b/mlir/docs/Diagnostics.md
@@ -77,7 +77,7 @@ InFlightDiagnostic Operation::emitOpError();
 ## Diagnostic
 
 A `Diagnostic` in MLIR contains all of the necessary information for reporting a
-message to the user. A `Diagnostic` essentially boils down to three main
+message to the user. A `Diagnostic` essentially boils down to four main
 components:
 
 *   [Source Location](#source-locations)
@@ -85,6 +85,11 @@ components:
     -   Error, Note, Remark, Warning
 *   Diagnostic Arguments
     -   The diagnostic arguments are used when constructing the output message.
+*   Metadata
+    -   Some additional information attached that can be used to identify 
+        this diagnostic other than source location and severity level 
+        (e.g. for diagnostic handlers to do some filtering). 
+        Metadata is not part of the output message.
 
 ### Appending arguments
 
@@ -143,6 +148,11 @@ op->emitError("...").attachNote(noteLoc) << "...";
 op->emitError("...").attachNote() << "...";
 ```
 
+### Managing Metadata
+
+Metadata is a mutable vector of `DiagnosticArgument`s. It can be accessed and
+modified directly, and it is not included in the rendered output message.
+
 ## InFlight Diagnostic
 
 Now that [Diagnostics](#diagnostic) have been explained, we introduce the
diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h
index bb2e1bb183e9b..cb30bb3f59688 100644
--- a/mlir/include/mlir/IR/Diagnostics.h
+++ b/mlir/include/mlir/IR/Diagnostics.h
@@ -271,6 +271,9 @@ class Diagnostic {
     return failure();
   }
 
+  /// Returns the current list of diagnostic metadata.
+  SmallVectorImpl<DiagnosticArgument> &getMetadata() { return metadata; }
+
 private:
   Diagnostic(const Diagnostic &rhs) = delete;
   Diagnostic &operator=(const Diagnostic &rhs) = delete;
@@ -290,6 +293,9 @@ class Diagnostic {
 
   /// A list of attached notes.
   NoteVector notes;
+
+  /// A list of metadata attached to this Diagnostic.
+  SmallVector<DiagnosticArgument, 0> metadata;
 };
 
 inline raw_ostream &operator<<(raw_ostream &os, const Diagnostic &diag) {
diff --git a/mlir/test/IR/diagnostic-handler-metadata.mlir b/mlir/test/IR/diagnostic-handler-metadata.mlir
new file mode 100644
index 0000000000000..c089185cc3570
--- /dev/null
+++ b/mlir/test/IR/diagnostic-handler-metadata.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-diagnostic-metadata))" -verify-diagnostics -o - 2>&1 | FileCheck %s
+// COM: This test verifies that a diagnostic handler can use a diagnostic's metadata
+// COM: to decide whether to emit the error.
+
+// CHECK-LABEL: Test 'test'
+func.func @test() {
+  // expected-error @+1 {{test diagnostic metadata}}
+  "test.emit_error"() {
+    // CHECK: attr = "emit_error"
+    attr = "emit_error"
+  } : () -> ()
+
+  "test.do_not_emit_error"() {
+    // CHECK: attr = "do_not_emit_error"
+    attr = "do_not_emit_error"
+  } : () -> ()
+
+  return
+}
diff --git a/mlir/test/lib/IR/CMakeLists.txt b/mlir/test/lib/IR/CMakeLists.txt
index faaa3bb8db24c..01297ad0a1148 100644
--- a/mlir/test/lib/IR/CMakeLists.txt
+++ b/mlir/test/lib/IR/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_library(MLIRTestIR
   TestBuiltinDistinctAttributes.cpp
   TestClone.cpp
   TestDiagnostics.cpp
+  TestDiagnosticsMetadata.cpp
   TestDominance.cpp
   TestFunc.cpp
   TestInterfaces.cpp
diff --git a/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp b/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp
new file mode 100644
index 0000000000000..5cb0193baa171
--- /dev/null
+++ b/mlir/test/lib/IR/TestDiagnosticsMetadata.cpp
@@ -0,0 +1,65 @@
+//===- TestDiagnosticsMetadata.cpp - Test Diagnostic Metadata -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a test pass for attaching metadata to diagnostics and
+// filtering on it in a diagnostic handler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/Support/SourceMgr.h"
+
+using namespace mlir;
+
+namespace {
+struct TestDiagnosticMetadataPass
+    : public PassWrapper<TestDiagnosticMetadataPass,
+                         InterfacePass<SymbolOpInterface>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestDiagnosticMetadataPass)
+
+  StringRef getArgument() const final { return "test-diagnostic-metadata"; }
+  StringRef getDescription() const final { return "Test diagnostic metadata."; }
+  TestDiagnosticMetadataPass() = default;
+  TestDiagnosticMetadataPass(const TestDiagnosticMetadataPass &) {}
+
+  void runOnOperation() override {
+    llvm::errs() << "Test '" << getOperation().getName() << "'\n";
+
+    // Build a diagnostic handler that has filtering capabilities.
+    ScopedDiagnosticHandler handler(&getContext(), [](mlir::Diagnostic &diag) {
+      return mlir::success(
+          llvm::none_of(diag.getMetadata(), [](mlir::DiagnosticArgument &arg) {
+            return arg.getKind() == mlir::DiagnosticArgument::
+                                        DiagnosticArgumentKind::String &&
+                   arg.getAsString().contains("hello");
+          }));
+    });
+
+    // Emit a tagged diagnostic for operations marked with attr = "emit_error".
+    getOperation()->walk([&](Operation *op) {
+      if (StringAttr strAttr = op->getAttrOfType<StringAttr>("attr")) {
+        if (strAttr.getValue() == "emit_error")
+          emitError(op->getLoc(), "test diagnostic metadata")
+              .getUnderlyingDiagnostic()
+              ->getMetadata()
+              .push_back(DiagnosticArgument("hello"));
+      }
+    });
+  }
+};
+
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestDiagnosticsMetadataPass() {
+  PassRegistration<TestDiagnosticMetadataPass>{};
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 0f29963da39bb..1842fa158e75a 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -92,6 +92,7 @@ void registerTestDataLayoutQuery();
 void registerTestDeadCodeAnalysisPass();
 void registerTestDecomposeCallGraphTypes();
 void registerTestDiagnosticsPass();
+void registerTestDiagnosticsMetadataPass();
 void registerTestDominancePass();
 void registerTestDynamicPipelinePass();
 void registerTestEmulateNarrowTypePass();
@@ -226,6 +227,7 @@ void registerTestPasses() {
   mlir::test::registerTestDeadCodeAnalysisPass();
   mlir::test::registerTestDecomposeCallGraphTypes();
   mlir::test::registerTestDiagnosticsPass();
+  mlir::test::registerTestDiagnosticsMetadataPass();
   mlir::test::registerTestDominancePass();
   mlir::test::registerTestDynamicPipelinePass();
   mlir::test::registerTestEmulateNarrowTypePass();
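
A condensed sketch of the intended usage, distilled from the test pass added above (the tag string "my-unique-id" is just an example): the emitting code attaches a string argument as metadata, and a handler filters on it without the tag ever appearing in the rendered message.

#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/STLExtras.h"

using namespace mlir;

void emitTaggedError(Operation *op) {
  emitError(op->getLoc(), "something went wrong")
      .getUnderlyingDiagnostic()
      ->getMetadata()
      .push_back(DiagnosticArgument("my-unique-id"));
}

// Returning success() marks a diagnostic as handled (swallowed); here every
// diagnostic is consumed except the ones carrying our tag, which are left for
// the next registered handler to report.
LogicalResult filterOnMetadata(Diagnostic &diag) {
  return success(llvm::none_of(diag.getMetadata(), [](DiagnosticArgument &arg) {
    return arg.getKind() ==
               DiagnosticArgument::DiagnosticArgumentKind::String &&
           arg.getAsString() == "my-unique-id";
  }));
}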

>From daf9d7f3881e4ffd1fd5988dc1b6223c733b6bef Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz at cn.ibm.com>
Date: Thu, 25 Jul 2024 10:11:52 -0400
Subject: [PATCH 145/259] [PowerPC] add test cases for PPC, NFC

For the builtins __builtin_cpu_is and __builtin_cpu_supports.
---
 clang/test/Sema/builtin-cpu-supports.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/clang/test/Sema/builtin-cpu-supports.c b/clang/test/Sema/builtin-cpu-supports.c
index 133410abaa711..4dace25b42a97 100644
--- a/clang/test/Sema/builtin-cpu-supports.c
+++ b/clang/test/Sema/builtin-cpu-supports.c
@@ -2,6 +2,9 @@
 // RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s
 // RUN: %clang_cc1 -fsyntax-only -triple riscv32-linux-gnu -verify %s
 // RUN: %clang_cc1 -fsyntax-only -triple riscv64-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc64le-unknown-linux -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc64-unknown-aix7.2.0.0 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc-unknown-aix7.2.0.0 -verify %s
 
 extern void a(const char *);
 
@@ -45,5 +48,16 @@ int main(void) {
     a("vsx");
 #endif
 
+#ifdef __powerpc__
+  if (__builtin_cpu_is("garbage")) // expected-error {{invalid cpu name for builtin}}
+    a("vsx");
+
+  if (__builtin_cpu_is("power3")) // expected-error {{invalid cpu name for builtin}}
+    a("vsx");
+
+  if (__builtin_cpu_supports("garbage")) // expected-warning {{invalid cpu feature string for builtin}}
+    a("vsx");
+#endif
+
   return 0;
 }
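
For context, a minimal sketch of how these builtins are typically used on PowerPC; the feature and CPU names are only examples, and the kernel functions are hypothetical:

void run_vsx_kernel();      // hypothetical fast path
void run_power9_kernel();   // hypothetical tuned path
void run_generic_kernel();  // hypothetical fallback

void dispatch(void) {
  if (__builtin_cpu_supports("vsx"))
    run_vsx_kernel();
  else if (__builtin_cpu_is("power9"))
    run_power9_kernel();
  else
    run_generic_kernel();
}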

>From a0c590795eae48903a013699fe84db335854fa80 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Thu, 25 Jul 2024 09:20:18 -0500
Subject: [PATCH 146/259] [Frontend][OpenMP] Allow implicit clauses to fail to
 apply (#100460)

The `linear(x)` clause implies `firstprivate(x)` on the compound
construct if `x` is not an induction variable. With more construct
combinations coming in OpenMP 6.0, the `firstprivate` clause may not be
possible to apply, e.g. in "masked simd".
An additional benefit from this change is that it allows treating leaf
constructs as combined constructs with a single constituent. Otherwise,
a `linear` clause on a lone `simd` construct could imply a
`firstprivate` clause that can't be applied.
---
 .../Frontend/OpenMP/ConstructDecompositionT.h   | 10 ++++++++--
 .../Frontend/OpenMPDecompositionTest.cpp        | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 349d862135d8c..b93bc594a82bf 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -1114,6 +1114,11 @@ bool ConstructDecompositionT<C, H>::applyClause(
 template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   bool success = true;
 
+  auto isImplicit = [this](const ClauseTy *node) {
+    return llvm::any_of(
+        implicit, [node](const ClauseTy &clause) { return &clause == node; });
+  };
+
   for (llvm::omp::Directive leaf :
        llvm::omp::getLeafConstructsOrSelf(construct))
     leafs.push_back(LeafReprInternal{leaf, /*clauses=*/{}});
@@ -1153,9 +1158,10 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   for (const ClauseTy *node : nodes) {
     if (skip(node))
       continue;
-    success =
-        success &&
+    bool result =
         std::visit([&](auto &&s) { return applyClause(s, node); }, node->u);
+    if (!isImplicit(node))
+      success = success && result;
   }
 
   // Apply "allocate".
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
index c70341b5a86d2..f9541131e4b23 100644
--- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -1100,4 +1100,21 @@ TEST_F(OpenMPDecompositionTest, Nowait1) {
   ASSERT_EQ(Dir1, "parallel");      // (23)
   ASSERT_EQ(Dir2, "for");           // (23)
 }
+
+// ---
+
+// Check that "simd linear(x)" does not fail despite the implied "firstprivate"
+// (which "simd" does not allow).
+TEST_F(OpenMPDecompositionTest, Misc1) {
+  omp::Object x{"x"};
+  omp::List<omp::Clause> Clauses{
+      {OMPC_linear,
+       omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_simd, Clauses);
+  ASSERT_EQ(Dec.output.size(), 1u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  ASSERT_EQ(Dir0, "simd linear(, , , (x)) lastprivate(, (x))");
+}
 } // namespace
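
A source-level view of the case the new Misc1 unit test covers (sketch only: x advances linearly with the iterations but is not the induction variable, so the decomposition would previously have implied a firstprivate clause that a lone simd construct cannot carry):

void f(int *out, const int *in, int n) {
  int x = 0;
  #pragma omp simd linear(x : 1)
  for (int i = 0; i < n; ++i) {
    out[x] = in[i];
    x += 1;
  }
}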

>From 8758091a70393f71c5a75805f5cbde556f6dce22 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 25 Jul 2024 09:21:05 -0500
Subject: [PATCH 147/259] [LLVM] Add 'ExpandVariadicsPass' to LTO default
 pipeline (#100479)

Summary:
This pass expands variadic functions into non-variadic function calls
according to the target ABI. Currently, this is used as the lowering for
the NVPTX and AMDGPU targets.

This pass is currently only run late in the target's backend. However,
during LTO we want to run it before the inliner pass so that the
expanded functions can be inlined using standard heuristics. This pass
is a no-op for unsupported targets, so it does not change code for targets
that do not use this lowering.
---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  4 ++
 llvm/test/Other/new-pm-lto-defaults.ll        |  1 +
 llvm/test/Transforms/PhaseOrdering/varargs.ll | 41 +++++++++++++++++++
 3 files changed, 46 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/varargs.ll

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6f36bdad780ae..757b20dcd6693 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -1874,6 +1875,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                 PTO.EagerlyInvalidateAnalyses));
 
+  // Lower variadic functions for supported targets prior to inlining.
+  MPM.addPass(ExpandVariadicsPass(ExpandVariadicsMode::Optimize));
+
   // Note: historically, the PruneEH pass was run first to deduce nounwind and
   // generally clean up exception handling overhead. It isn't clear this is
   // valuable as the inliner doesn't currently care whether it is inlining an
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index d451d2897f673..5543472df685b 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -69,6 +69,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
+; CHECK-O23SZ-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O23SZ-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O23SZ-NEXT: Running analysis: InlineAdvisorAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: InlinerPass
diff --git a/llvm/test/Transforms/PhaseOrdering/varargs.ll b/llvm/test/Transforms/PhaseOrdering/varargs.ll
new file mode 100644
index 0000000000000..15b82f88502d7
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/varargs.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -S -passes='lto<O2>' < %s | FileCheck %s
+target triple = "amdgcn-amd-amdhsa"
+
+; We use the ExpandVariadics pass to lower variadic functions so they can be
+; inlined.
+
+define i32 @foo() {
+; CHECK-LABEL: define i32 @foo(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret i32 6
+;
+entry:
+  %call = tail call i32 (i32, ...) @vararg(i32 poison, i32 noundef 1, i32 noundef 2, i32 noundef 3)
+  ret i32 %call
+}
+
+define internal i32 @vararg(i32 %first, ...) {
+entry:
+  %vlist = alloca ptr, align 8
+  call void @llvm.va_start.p0(ptr %vlist)
+  %vlist.promoted = load ptr, ptr %vlist, align 8
+  %argp.a = getelementptr inbounds i8, ptr %vlist.promoted, i64 4
+  store ptr %argp.a, ptr %vlist, align 8
+  %a = load i32, ptr %vlist.promoted, align 4
+  %argp.b = getelementptr inbounds i8, ptr %vlist.promoted, i64 8
+  store ptr %argp.b, ptr %vlist, align 8
+  %b = load i32, ptr %argp.a, align 4
+  %sum = add nsw i32 %b, %a
+  %argp.c = getelementptr inbounds i8, ptr %vlist.promoted, i64 12
+  store ptr %argp.c, ptr %vlist, align 8
+  %c = load i32, ptr %argp.b, align 4
+  %ret = add nsw i32 %c, %sum
+  call void @llvm.va_end.p0(ptr %vlist)
+  ret i32 %ret
+}
+
+declare void @llvm.va_start.p0(ptr)
+
+declare void @llvm.va_end.p0(ptr)
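
A source-level counterpart of the IR test above (a sketch: with the variadic callee internal to the module, ExpandVariadics rewrites it into a fixed-argument form that the LTO inliner can then fold, so foo() reduces to the constant 6):

#include <cstdarg>

static int vararg_sum(int first, ...) {
  (void)first;
  va_list ap;
  va_start(ap, first);
  int a = va_arg(ap, int);
  int b = va_arg(ap, int);
  int c = va_arg(ap, int);
  va_end(ap);
  return a + b + c;
}

int foo(void) {
  // Mirrors the IR above: 1 + 2 + 3 should constant-fold to 6 after LTO.
  return vararg_sum(0, 1, 2, 3);
}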

>From 4db37a49a72bb9cff7a78e77439008c058383099 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 25 Jul 2024 09:38:36 -0500
Subject: [PATCH 148/259] [LLVM] Only run vararg test with the amdgpu target

---
 llvm/test/Transforms/PhaseOrdering/varargs.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/test/Transforms/PhaseOrdering/varargs.ll b/llvm/test/Transforms/PhaseOrdering/varargs.ll
index 15b82f88502d7..c66a982a06a6d 100644
--- a/llvm/test/Transforms/PhaseOrdering/varargs.ll
+++ b/llvm/test/Transforms/PhaseOrdering/varargs.ll
@@ -2,6 +2,8 @@
 ; RUN: opt -mtriple=amdgcn-- -S -passes='lto<O2>' < %s | FileCheck %s
 target triple = "amdgcn-amd-amdhsa"
 
+; REQUIRES: amdgpu-registered-target
+
 ; We use the ExpandVariadics pass to lower variadic functions so they can be
 ; inlined.
 

>From 010dcfd85fb8b8c8826f25f27324e3a958dba5f8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 25 Jul 2024 16:01:05 +0100
Subject: [PATCH 149/259] [CostModel][X86] Improve add/sub/mul overflow
 intrinsic costs

Noticed due to x86 changes in #97463
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  34 +-
 .../Analysis/CostModel/X86/arith-overflow.ll  | 460 +++++++++---------
 llvm/test/Analysis/CostModel/X86/costmodel.ll |   2 +-
 .../CostModel/X86/intrinsic-cost-kinds.ll     |   8 +-
 .../LoopVectorize/X86/interleave-cost.ll      |   4 +-
 .../Transforms/LoopVectorize/X86/pr35432.ll   |   2 +-
 .../Transforms/LoopVectorize/X86/pr54634.ll   |   2 +-
 7 files changed, 259 insertions(+), 253 deletions(-)
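
One note for reading the updated cost tables below: each entry now carries four values instead of one, selected per query (assuming they correspond to the reciprocal-throughput, latency, code-size, and size-and-latency cost kinds, in that order). A simplified, standalone sketch of how such a tuple would be consumed (not the real CostKindTblEntry type):

#include <cstdio>

enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

struct OverflowCost {
  unsigned RecipThroughput, Latency, CodeSize, SizeAndLatency;
};

// E.g. the new i64 SADDO entry { 2, 2, 4, 6 } from the table below.
constexpr OverflowCost SAddWithOverflowI64 = {2, 2, 4, 6};

unsigned costFor(const OverflowCost &C, CostKind K) {
  switch (K) {
  case CostKind::RecipThroughput: return C.RecipThroughput;
  case CostKind::Latency:         return C.Latency;
  case CostKind::CodeSize:        return C.CodeSize;
  case CostKind::SizeAndLatency:  return C.SizeAndLatency;
  }
  return 0;
}

int main() {
  std::printf("latency cost: %u\n", costFor(SAddWithOverflowI64, CostKind::Latency));
  return 0;
}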

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ec8af2bc4c118..161fe137462c6 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4167,9 +4167,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::SMIN,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::UMAX,       MVT::i64,     {  1,  3,  2,  3 } },
     { ISD::UMIN,       MVT::i64,     {  1,  3,  2,  3 } },
-    { ISD::SADDO,      MVT::i64,     {  1 } },
-    { ISD::UADDO,      MVT::i64,     {  1 } },
-    { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
+    { ISD::SADDO,      MVT::i64,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i64,     {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i64,     {  4,  4,  4,  6 } },
+    { ISD::UMULO,      MVT::i64,     {  8,  8,  4,  7 } },
   };
   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
     { ISD::ABS,        MVT::i32,     {  1,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
@@ -4231,15 +4232,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMIN,       MVT::i32,     {  1,  2,  2,  3 } },
     { ISD::UMIN,       MVT::i16,     {  1,  4,  2,  4 } },
     { ISD::UMIN,       MVT::i8,      {  1,  4,  2,  4 } },
-    { ISD::SADDO,      MVT::i32,     {  1 } },
-    { ISD::SADDO,      MVT::i16,     {  1 } },
-    { ISD::SADDO,      MVT::i8,      {  1 } },
-    { ISD::UADDO,      MVT::i32,     {  1 } },
-    { ISD::UADDO,      MVT::i16,     {  1 } },
-    { ISD::UADDO,      MVT::i8,      {  1 } },
-    { ISD::UMULO,      MVT::i32,     {  2 } }, // mul + seto
-    { ISD::UMULO,      MVT::i16,     {  2 } },
-    { ISD::UMULO,      MVT::i8,      {  2 } },
+    { ISD::SADDO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::SADDO,      MVT::i16,     {  2,  2,  4,  6 } },
+    { ISD::SADDO,      MVT::i8,      {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i16,     {  2,  2,  4,  6 } },
+    { ISD::UADDO,      MVT::i8,      {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i32,     {  2,  2,  4,  6 } },
+    { ISD::SMULO,      MVT::i16,     {  5,  5,  4,  6 } },
+    { ISD::SMULO,      MVT::i8,      {  6,  6,  4,  6 } },
+    { ISD::UMULO,      MVT::i32,     {  6,  6,  4,  8 } },
+    { ISD::UMULO,      MVT::i16,     {  6,  6,  4,  9 } },
+    { ISD::UMULO,      MVT::i8,      {  6,  6,  4,  6 } },
   };
 
   Type *RetTy = ICA.getReturnType();
@@ -4352,9 +4356,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     ISD = ISD::UADDO;
     OpTy = RetTy->getContainedType(0);
     break;
-  case Intrinsic::umul_with_overflow:
   case Intrinsic::smul_with_overflow:
-    // SMULO has same costs so don't duplicate.
+    ISD = ISD::SMULO;
+    OpTy = RetTy->getContainedType(0);
+    break;
+  case Intrinsic::umul_with_overflow:
     ISD = ISD::UMULO;
     OpTy = RetTy->getContainedType(0);
     break;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index ba745262d1890..c5da46af04367 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -37,190 +37,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @sadd(i32 %arg) {
 ; SSSE3-LABEL: 'sadd'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sadd'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sadd'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sadd'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sadd'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sadd'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sadd'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sadd'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'sadd'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sadd'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -275,190 +275,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @uadd(i32 %arg) {
 ; SSSE3-LABEL: 'uadd'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'uadd'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'uadd'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'uadd'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uadd'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'uadd'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'uadd'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'uadd'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'uadd'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'uadd'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -513,190 +513,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @ssub(i32 %arg) {
 ; SSSE3-LABEL: 'ssub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'ssub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ssub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ssub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ssub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ssub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'ssub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'ssub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'ssub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'ssub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -751,190 +751,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @usub(i32 %arg) {
 ; SSSE3-LABEL: 'usub'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'usub'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'usub'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'usub'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'usub'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'usub'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'usub'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'usub'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'usub'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'usub'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -989,7 +989,7 @@ declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @smul(i32 %arg) {
 ; SSSE3-LABEL: 'smul'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -997,18 +997,18 @@ define i32 @smul(i32 %arg) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'smul'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1016,18 +1016,18 @@ define i32 @smul(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'smul'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1035,18 +1035,18 @@ define i32 @smul(i32 %arg) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'smul'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1054,18 +1054,18 @@ define i32 @smul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'smul'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1073,18 +1073,18 @@ define i32 @smul(i32 %arg) {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'smul'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1092,18 +1092,18 @@ define i32 @smul(i32 %arg) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'smul'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1111,18 +1111,18 @@ define i32 @smul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'smul'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1130,18 +1130,18 @@ define i32 @smul(i32 %arg) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'smul'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1149,18 +1149,18 @@ define i32 @smul(i32 %arg) {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'smul'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -1168,11 +1168,11 @@ define i32 @smul(i32 %arg) {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
@@ -1227,190 +1227,190 @@ declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x
 
 define i32 @umul(i32 %arg) {
 ; SSSE3-LABEL: 'umul'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'umul'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'umul'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'umul'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'umul'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'umul'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'umul'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'umul'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'umul'
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'umul'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/costmodel.ll b/llvm/test/Analysis/CostModel/X86/costmodel.ll
index 7e5ee3bb62799..e536f341019af 100644
--- a/llvm/test/Analysis/CostModel/X86/costmodel.ll
+++ b/llvm/test/Analysis/CostModel/X86/costmodel.ll
@@ -29,7 +29,7 @@ define i64 @foo(i64 %arg) {
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %I2P = inttoptr i64 undef to ptr
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %P2I = ptrtoint ptr undef to i64
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %TC = trunc i64 undef to i32
-; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void undef()
 ; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 06429a5107113..bbe3396ace8f8 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -50,22 +50,22 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 
 define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; THRU-LABEL: 'umul'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'umul'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'umul'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'umul'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index b1f7516f3c8dc..cc1d11754b27e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -188,7 +188,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
 ; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[ARG1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 30
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 54
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG2]], i64 8
@@ -379,7 +379,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 28
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 52
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 3
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index a49090eecac41..eb2de69afbe58 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -38,7 +38,7 @@ define i32 @main(ptr %ptr) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 32
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 40
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[CONV3]], -1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 743ca20f92b49..5aac001a8b9cf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -19,7 +19,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
 ; CHECK-NEXT:    [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1
 ; CHECK-NEXT:    [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 60
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])

>From 51d4980a133db12888207698e39c469cb7055cac Mon Sep 17 00:00:00 2001
From: Mike Rice <michael.p.rice at intel.com>
Date: Thu, 25 Jul 2024 08:03:16 -0700
Subject: [PATCH 150/259] [clang][NFC] remove unneeded nullptr checks after
 dereference (#100489)

Fix static verifier concerns about null-pointer checks performed after
the pointer has already been dereferenced. Update the assert to make it
explicit that the pointer is not null, and remove the now-redundant
checks.
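
For illustration, a minimal self-contained sketch of the pattern being
cleaned up (hypothetical names, not the actual Sema code): once the
assert dereferences the pointer, later conditional null checks are dead
and only confuse static verifiers.

#include <cassert>
#include <string>

// Made-up stand-in for CXXScopeSpec, just enough to show the shape.
struct ScopeSpec {
  bool isInvalid() const { return false; }
  std::string getName() const { return "E"; }
};

std::string describe(ScopeSpec *SS) {
  // Assert non-null (and valid) once up front...
  assert(SS && !SS->isInvalid() && "ScopeSpec is invalid");
  // ...then use SS directly; guards like `SS ? SS->getName() : ""` past
  // this point would be unreachable dead branches.
  return SS->getName();
}

int main() {
  ScopeSpec S;
  return describe(&S) == "E" ? 0 : 1;
}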
---
 clang/lib/Sema/SemaDeclCXX.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 04b8d88cae217..1cca8ac9b9343 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -12248,16 +12248,15 @@ Decl *Sema::ActOnUsingEnumDeclaration(Scope *S, AccessSpecifier AS,
                                       SourceLocation EnumLoc, SourceRange TyLoc,
                                       const IdentifierInfo &II, ParsedType Ty,
                                       CXXScopeSpec *SS) {
-  assert(!SS->isInvalid() && "ScopeSpec is invalid");
+  assert(SS && !SS->isInvalid() && "ScopeSpec is invalid");
   TypeSourceInfo *TSI = nullptr;
   SourceLocation IdentLoc = TyLoc.getBegin();
   QualType EnumTy = GetTypeFromParser(Ty, &TSI);
   if (EnumTy.isNull()) {
-    Diag(IdentLoc, SS && isDependentScopeSpecifier(*SS)
+    Diag(IdentLoc, isDependentScopeSpecifier(*SS)
                        ? diag::err_using_enum_is_dependent
                        : diag::err_unknown_typename)
-        << II.getName()
-        << SourceRange(SS ? SS->getBeginLoc() : IdentLoc, TyLoc.getEnd());
+        << II.getName() << SourceRange(SS->getBeginLoc(), TyLoc.getEnd());
     return nullptr;
   }
 

>From abc2eae68290c453e1899a94eccc4ed5ea3b69c1 Mon Sep 17 00:00:00 2001
From: Tristan Ross <tristan.ross at midstall.com>
Date: Thu, 25 Jul 2024 08:18:14 -0700
Subject: [PATCH 151/259] [BOLT] Enable standalone build (#97130)

Continues from #87196; as the original author did not have much time, I
have taken over this PR. We would like to have this so that BOLT is
easier to package for Nix.

This can be tested by copying the cmake, bolt, third-party, and llvm
directories out into their own directory with this PR applied and then
building BOLT.

---------

Co-authored-by: pca006132 <john.lck40 at gmail.com>
---
 bolt/CMakeLists.txt                           | 54 ++++++++++++++++++-
 .../include/bolt/RuntimeLibs/RuntimeLibrary.h | 11 +++-
 bolt/lib/CMakeLists.txt                       |  2 +
 bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp |  8 +--
 .../InstrumentationRuntimeLibrary.cpp         |  2 +-
 bolt/lib/RuntimeLibs/RuntimeLibrary.cpp       | 35 +++++++++---
 bolt/lib/Target/AArch64/CMakeLists.txt        | 12 +++++
 bolt/lib/Target/RISCV/CMakeLists.txt          | 13 +++++
 bolt/lib/Target/X86/CMakeLists.txt            | 12 +++++
 bolt/lib/Utils/CMakeLists.txt                 | 32 +++++++++--
 bolt/lib/Utils/CommandLineOpts.cpp            |  6 +--
 bolt/runtime/CMakeLists.txt                   | 22 ++++++--
 bolt/test/lit.cfg.py                          | 14 ++++-
 bolt/test/lit.site.cfg.py.in                  |  2 +
 14 files changed, 199 insertions(+), 26 deletions(-)

diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 74907ad118d12..9f5875dd21284 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -1,6 +1,17 @@
+cmake_minimum_required(VERSION 3.20.0)
+
 set(LLVM_SUBPROJECT_TITLE "BOLT")
 
-include(ExternalProject)
+if(NOT DEFINED LLVM_COMMON_CMAKE_UTILS)
+  set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
+endif()
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
+  NO_POLICY_SCOPE)
+
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+  project(bolt)
+  set(BOLT_BUILT_STANDALONE TRUE)
+endif()
 
 set(BOLT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(BOLT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -9,6 +20,42 @@ set(CMAKE_CXX_STANDARD 17)
 # Add path for custom modules.
 list(INSERT CMAKE_MODULE_PATH 0 "${BOLT_SOURCE_DIR}/cmake/modules")
 
+include(GNUInstallDirs)
+
+# standalone build, copied from clang
+if(BOLT_BUILT_STANDALONE)
+  set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
+  set(CMAKE_CXX_STANDARD_REQUIRED YES)
+  set(CMAKE_CXX_EXTENSIONS NO)
+
+  if(NOT MSVC_IDE)
+    set(LLVM_ENABLE_ASSERTIONS ${ENABLE_ASSERTIONS}
+      CACHE BOOL "Enable assertions")
+    # Assertions should follow llvm-config's.
+    mark_as_advanced(LLVM_ENABLE_ASSERTIONS)
+  endif()
+
+  find_package(LLVM REQUIRED HINTS "${LLVM_CMAKE_DIR}")
+  list(APPEND CMAKE_MODULE_PATH "${LLVM_DIR}")
+
+  set(LLVM_MAIN_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../llvm" CACHE PATH "Path to LLVM source tree")
+  find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR}
+    NO_DEFAULT_PATH)
+
+  # They are used as destination of target generators.
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
+
+  include(AddLLVM)
+  include(TableGen)
+  include_directories(${LLVM_INCLUDE_DIRS})
+  link_directories("${LLVM_LIBRARY_DIR}")
+
+  set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_INSTALL_BINDIR}" )
+  set( CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}/${LLVM_LIBDIR_SUFFIX}" )
+  set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}/${LLVM_LIBDIR_SUFFIX}")
+endif() # standalone
+
 # Determine default set of targets to build -- the intersection of
 # those BOLT supports and those LLVM is targeting.
 set(BOLT_TARGETS_TO_BUILD_all "AArch64;X86;RISCV")
@@ -94,6 +141,8 @@ if (BOLT_ENABLE_RUNTIME)
   if(CMAKE_SYSROOT)
     list(APPEND extra_args -DCMAKE_SYSROOT=${CMAKE_SYSROOT})
   endif()
+
+  include(ExternalProject)
   ExternalProject_Add(bolt_rt
     SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/runtime"
     STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-stamps
@@ -104,6 +153,7 @@ if (BOLT_ENABLE_RUNTIME)
                -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}
                -DLLVM_LIBDIR_SUFFIX=${LLVM_LIBDIR_SUFFIX}
                -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR}
+               -DBOLT_BUILT_STANDALONE=${BOLT_BUILT_STANDALONE}
                ${extra_args}
     INSTALL_COMMAND ""
     BUILD_ALWAYS True
@@ -113,6 +163,8 @@ if (BOLT_ENABLE_RUNTIME)
   add_llvm_install_targets(install-bolt_rt
     DEPENDS bolt_rt bolt
     COMPONENT bolt)
+  set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a")
+  set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a")
 endif()
 
 find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld")
diff --git a/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h b/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
index e392029156bce..fc1db7369eb4a 100644
--- a/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
+++ b/bolt/include/bolt/RuntimeLibs/RuntimeLibrary.h
@@ -58,7 +58,16 @@ class RuntimeLibrary {
   uint64_t RuntimeFiniAddress{0};
   uint64_t RuntimeStartAddress{0};
 
-  /// Get the full path to a runtime library specified by \p LibFileName.
+  /// Get the full path to a runtime library specified by \p LibFileName and \p
+  /// ToolPath.
+  static std::string getLibPathByToolPath(StringRef ToolPath,
+                                          StringRef LibFileName);
+
+  /// Get the full path to a runtime library by the install directory.
+  static std::string getLibPathByInstalled(StringRef LibFileName);
+
+  /// Gets the full path to a runtime library based on whether it exists
+  /// in the install libdir or runtime libdir.
   static std::string getLibPath(StringRef ToolPath, StringRef LibFileName);
 
   /// Load a static runtime library specified by \p LibPath.
diff --git a/bolt/lib/CMakeLists.txt b/bolt/lib/CMakeLists.txt
index 22a6be44f6458..d0f921a1a1e2c 100644
--- a/bolt/lib/CMakeLists.txt
+++ b/bolt/lib/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_compile_definitions(CMAKE_INSTALL_FULL_LIBDIR="${CMAKE_INSTALL_FULL_LIBDIR}")
+
 add_subdirectory(Core)
 add_subdirectory(Passes)
 add_subdirectory(Profile)
diff --git a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
index d114d70f2d376..026f8d35c55c6 100644
--- a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
@@ -32,10 +32,10 @@ cl::opt<bool>
                     "(which is what --hot-text relies on)."),
            cl::cat(BoltOptCategory));
 
-static cl::opt<std::string> RuntimeHugifyLib(
-    "runtime-hugify-lib",
-    cl::desc("specify file name of the runtime hugify library"),
-    cl::init("libbolt_rt_hugify.a"), cl::cat(BoltOptCategory));
+static cl::opt<std::string>
+    RuntimeHugifyLib("runtime-hugify-lib",
+                     cl::desc("specify path of the runtime hugify library"),
+                     cl::init("libbolt_rt_hugify.a"), cl::cat(BoltOptCategory));
 
 } // namespace opts
 
diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
index cd1b975be7b90..53a0c811b41d5 100644
--- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
@@ -26,7 +26,7 @@ namespace opts {
 
 cl::opt<std::string> RuntimeInstrumentationLib(
     "runtime-instrumentation-lib",
-    cl::desc("specify file name of the runtime instrumentation library"),
+    cl::desc("specify path of the runtime instrumentation library"),
     cl::init("libbolt_rt_instr.a"), cl::cat(BoltOptCategory));
 
 extern cl::opt<bool> InstrumentationFileAppendPID;
diff --git a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
index 276b034d71f96..336c6768a7f71 100644
--- a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp
@@ -26,8 +26,8 @@ using namespace bolt;
 
 void RuntimeLibrary::anchor() {}
 
-std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
-                                       StringRef LibFileName) {
+std::string RuntimeLibrary::getLibPathByToolPath(StringRef ToolPath,
+                                                 StringRef LibFileName) {
   StringRef Dir = llvm::sys::path::parent_path(ToolPath);
   SmallString<128> LibPath = llvm::sys::path::parent_path(Dir);
   llvm::sys::path::append(LibPath, "lib" LLVM_LIBDIR_SUFFIX);
@@ -38,13 +38,36 @@ std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
     llvm::sys::path::append(LibPath, "lib" LLVM_LIBDIR_SUFFIX);
   }
   llvm::sys::path::append(LibPath, LibFileName);
-  if (!llvm::sys::fs::exists(LibPath)) {
-    errs() << "BOLT-ERROR: library not found: " << LibPath << "\n";
-    exit(1);
-  }
   return std::string(LibPath);
 }
 
+std::string RuntimeLibrary::getLibPathByInstalled(StringRef LibFileName) {
+  SmallString<128> LibPath(CMAKE_INSTALL_FULL_LIBDIR);
+  llvm::sys::path::append(LibPath, LibFileName);
+  return std::string(LibPath);
+}
+
+std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
+                                       StringRef LibFileName) {
+  if (llvm::sys::fs::exists(LibFileName)) {
+    return std::string(LibFileName);
+  }
+
+  std::string ByTool = getLibPathByToolPath(ToolPath, LibFileName);
+  if (llvm::sys::fs::exists(ByTool)) {
+    return ByTool;
+  }
+
+  std::string ByInstalled = getLibPathByInstalled(LibFileName);
+  if (llvm::sys::fs::exists(ByInstalled)) {
+    return ByInstalled;
+  }
+
+  errs() << "BOLT-ERROR: library not found: " << ByTool << ", " << ByInstalled
+         << ", or " << LibFileName << "\n";
+  exit(1);
+}
+
 void RuntimeLibrary::loadLibrary(StringRef LibPath, BOLTLinker &Linker,
                                  BOLTLinker::SectionsMapper MapSections) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf =
diff --git a/bolt/lib/Target/AArch64/CMakeLists.txt b/bolt/lib/Target/AArch64/CMakeLists.txt
index be03e247aa96b..7e2d33e09b5a0 100644
--- a/bolt/lib/Target/AArch64/CMakeLists.txt
+++ b/bolt/lib/Target/AArch64/CMakeLists.txt
@@ -4,6 +4,18 @@ set(LLVM_LINK_COMPONENTS
   AArch64Desc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64/AArch64.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64)
+  tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
+  tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables)
+  tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(AArch64CommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetAArch64
   AArch64MCPlusBuilder.cpp
 
diff --git a/bolt/lib/Target/RISCV/CMakeLists.txt b/bolt/lib/Target/RISCV/CMakeLists.txt
index 7f95576063200..5d19d38717de4 100644
--- a/bolt/lib/Target/RISCV/CMakeLists.txt
+++ b/bolt/lib/Target/RISCV/CMakeLists.txt
@@ -4,6 +4,19 @@ set(LLVM_LINK_COMPONENTS
   RISCVDesc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  # tablegen, copied from llvm/lib/Target/RISCV/CMakeLists.txt
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV/RISCV.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV)
+  tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
+  tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables)
+  tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(RISCVCommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetRISCV
   RISCVMCPlusBuilder.cpp
 
diff --git a/bolt/lib/Target/X86/CMakeLists.txt b/bolt/lib/Target/X86/CMakeLists.txt
index 2b769bc7e7f5c..b274716e89a4c 100644
--- a/bolt/lib/Target/X86/CMakeLists.txt
+++ b/bolt/lib/Target/X86/CMakeLists.txt
@@ -5,6 +5,18 @@ set(LLVM_LINK_COMPONENTS
   X86Desc
   )
 
+if(BOLT_BUILT_STANDALONE)
+  set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/X86/X86.td)
+  list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target/X86)
+  tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info -instr-info-expand-mi-operand-info=0)
+  tablegen(LLVM X86GenMnemonicTables.inc -gen-x86-mnemonic-tables -asmwriternum=1)
+  tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
+  tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+
+  add_public_tablegen_target(X86CommonTableGen)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
 add_llvm_library(LLVMBOLTTargetX86
   X86MCPlusBuilder.cpp
   X86MCSymbolizer.cpp
diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index d1403314274bd..c452c1fac3772 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -1,15 +1,39 @@
+find_first_existing_vc_file("${LLVM_MAIN_SRC_DIR}" llvm_vc)
+find_first_existing_vc_file("${BOLT_SOURCE_DIR}" bolt_vc)
+
+# The VC revision include that we want to generate.
+set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/VCSVersion.inc")
+
+set(generate_vcs_version_script "${LLVM_CMAKE_DIR}/GenerateVersionFromVCS.cmake")
+
+# Create custom target to generate the VC revision include.
+add_custom_command(OUTPUT "${version_inc}"
+  DEPENDS "${llvm_vc}" "${bolt_vc}" "${generate_vcs_version_script}"
+  COMMAND ${CMAKE_COMMAND} "-DNAMES=BOLT"
+                           "-DHEADER_FILE=${version_inc}"
+                           "-DBOLT_SOURCE_DIR=${BOLT_SOURCE_DIR}"
+                           "-DLLVM_VC_REPOSITORY=${llvm_vc_repository}"
+                           "-DLLVM_VC_REVISION=${llvm_vc_revision}"
+                           "-DLLVM_FORCE_VC_REVISION=${LLVM_FORCE_VC_REVISION}"
+                           "-DLLVM_FORCE_VC_REPOSITORY=${LLVM_FORCE_VC_REPOSITORY}"
+                           -P "${generate_vcs_version_script}")
+
+# Mark the generated header as being generated.
+set_source_files_properties("${version_inc}"
+  PROPERTIES GENERATED TRUE
+             HEADER_FILE_ONLY TRUE)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
 add_llvm_library(LLVMBOLTUtils
   CommandLineOpts.cpp
   Utils.cpp
-
+  ${version_inc}
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
   ${LLVM_PTHREAD_LIB}
 
-  DEPENDS
-  llvm_vcsrevision_h
-
   LINK_COMPONENTS
   Support
   )
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index 47375abb2ad3b..435a8fa9cafca 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -11,15 +11,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Utils/CommandLineOpts.h"
-#include "llvm/Support/VCSRevision.h"
+#include "VCSVersion.inc"
 
 using namespace llvm;
 
 namespace llvm {
 namespace bolt {
 const char *BoltRevision =
-#ifdef LLVM_REVISION
-    LLVM_REVISION;
+#ifdef BOLT_REVISION
+    BOLT_REVISION;
 #else
     "<unknown>";
 #endif
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 6a65f80fb9079..948825ddc27eb 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -16,12 +16,19 @@ add_library(bolt_rt_instr STATIC
   instr.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
 add_library(bolt_rt_hugify STATIC
   hugify.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+
+if(NOT BOLT_BUILT_STANDALONE)
+  add_custom_command(TARGET bolt_rt_instr POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}")
+  add_custom_command(TARGET bolt_rt_hugify POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}")
+endif()
 
 set(BOLT_RT_FLAGS
   -ffreestanding
@@ -46,8 +53,8 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS})
 target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
-install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
-install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
 
 if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
   add_library(bolt_rt_instr_osx STATIC
@@ -59,5 +66,10 @@ if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Da
   target_compile_options(bolt_rt_instr_osx PRIVATE
     -target x86_64-apple-darwin19.6.0
     ${BOLT_RT_FLAGS})
-  install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+  install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
+
+  if(NOT BOLT_BUILT_STANDALONE)
+    add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}")
+  endif()
 endif()
diff --git a/bolt/test/lit.cfg.py b/bolt/test/lit.cfg.py
index 3a6da210e01f0..da3ae34ba3bdd 100644
--- a/bolt/test/lit.cfg.py
+++ b/bolt/test/lit.cfg.py
@@ -92,10 +92,22 @@
 
 tool_dirs = [config.llvm_tools_dir, config.test_source_root]
 
+llvm_bolt_args = []
+
+if config.libbolt_rt_instr:
+    llvm_bolt_args.append(f"--runtime-instrumentation-lib={config.libbolt_rt_instr}")
+
+if config.libbolt_rt_hugify:
+    llvm_bolt_args.append(f"--runtime-hugify-lib={config.libbolt_rt_hugify}")
+
 tools = [
     ToolSubst("llc", unresolved="fatal"),
     ToolSubst("llvm-dwarfdump", unresolved="fatal"),
-    ToolSubst("llvm-bolt", unresolved="fatal"),
+    ToolSubst(
+        "llvm-bolt",
+        unresolved="fatal",
+        extra_args=llvm_bolt_args,
+    ),
     ToolSubst("llvm-boltdiff", unresolved="fatal"),
     ToolSubst("llvm-bolt-heatmap", unresolved="fatal"),
     ToolSubst("llvm-bat-dump", unresolved="fatal"),
diff --git a/bolt/test/lit.site.cfg.py.in b/bolt/test/lit.site.cfg.py.in
index 46cb326dfbae1..457908fc7c446 100644
--- a/bolt/test/lit.site.cfg.py.in
+++ b/bolt/test/lit.site.cfg.py.in
@@ -19,6 +19,8 @@ config.bolt_clang = "@BOLT_CLANG_EXE@"
 config.bolt_lld = "@BOLT_LLD_EXE@"
 config.targets_to_build = "@BOLT_TARGETS_TO_BUILD@"
 config.gnu_ld = "@GNU_LD_EXECUTABLE@"
+config.libbolt_rt_instr = "@LIBBOLT_RT_INSTR@"
+config.libbolt_rt_hugify = "@LIBBOLT_RT_HUGIFY@"
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)

>From ffd624024865cf98644c4f9357ef369327ac0995 Mon Sep 17 00:00:00 2001
From: Tristan Ross <tristan.ross at midstall.com>
Date: Thu, 25 Jul 2024 08:20:57 -0700
Subject: [PATCH 152/259] [BOLT] Update Docker to use Ubuntu 24.04 (#99421)

Updates the Dockerfile to use Ubuntu 24.04, since CMake now wants a
newer version than Ubuntu 20.04 ships. This can be tested by trying to
build the Docker image currently in main and then building the Docker
image from this PR.
---
 bolt/utils/docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/utils/docker/Dockerfile b/bolt/utils/docker/Dockerfile
index 722a07e46f9e4..c2108f7aec53c 100644
--- a/bolt/utils/docker/Dockerfile
+++ b/bolt/utils/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04 AS builder
+FROM ubuntu:24.04 AS builder
 
 ARG DEBIAN_FRONTEND=noninteractive
 ENV TZ=UTC
@@ -26,6 +26,6 @@ RUN mkdir build && \
     ninja install-llvm-bolt install-perf2bolt install-merge-fdata \
       install-llvm-boltdiff install-bolt_rt
 
-FROM ubuntu:20.04
+FROM ubuntu:24.04
 
 COPY --from=builder /home/bolt/install /usr/local

>From 0762db6533eda3453158c7b9b0631542c47093a8 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93 at hotmail.co.uk>
Date: Thu, 25 Jul 2024 16:25:37 +0100
Subject: [PATCH 153/259] [clang-tidy] Fix crash in modernize-use-ranges
 (#100427)

The crash appears to be caused by the check function not handling
inline namespaces correctly in some instances. Changed how the Replacer
is retrieved from the MatchResult, which should alleviate any such
issues.

Fixes #100406
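
For context, a hypothetical reduced reproducer in the spirit of the
fake_std.h change below: an algorithm declared inside an inline
namespace. The inline namespace is transparent at the call site, but the
matched declaration's qualified name gains the extra component (e.g.
"::std::_V1::is_permutation"), which is presumably what tripped the old
lookup keyed on "::std::is_permutation".

// Mirrors the test's fake <algorithm>; stub body so the sketch links.
namespace std {
inline namespace _V1 {
template <class It1, class It2>
bool is_permutation(It1 first1, It1 last1, It2 first2) {
  (void)first1; (void)last1; (void)first2;
  return true;
}
} // namespace _V1
} // namespace std

int main() {
  int a[3] = {1, 2, 3};
  int b[3] = {3, 2, 1};
  // Users spell the call without the inline namespace.
  return std::is_permutation(a, a + 3, b) ? 0 : 1;
}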
---
 .../clang-tidy/utils/UseRangesCheck.cpp       | 64 +++++++++----------
 .../clang-tidy/utils/UseRangesCheck.h         |  2 +-
 .../modernize/Inputs/use-ranges/fake_std.h    | 17 +++--
 3 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
index e2daa5010e2ae..aba4d17ccd035 100644
--- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp
@@ -39,12 +39,6 @@ static constexpr const char ArgName[] = "ArgName";
 
 namespace clang::tidy::utils {
 
-static bool operator==(const UseRangesCheck::Indexes &L,
-                       const UseRangesCheck::Indexes &R) {
-  return std::tie(L.BeginArg, L.EndArg, L.ReplaceArg) ==
-         std::tie(R.BeginArg, R.EndArg, R.ReplaceArg);
-}
-
 static std::string getFullPrefix(ArrayRef<UseRangesCheck::Indexes> Signature) {
   std::string Output;
   llvm::raw_string_ostream OS(Output);
@@ -54,15 +48,6 @@ static std::string getFullPrefix(ArrayRef<UseRangesCheck::Indexes> Signature) {
   return Output;
 }
 
-static llvm::hash_code hash_value(const UseRangesCheck::Indexes &Indexes) {
-  return llvm::hash_combine(Indexes.BeginArg, Indexes.EndArg,
-                            Indexes.ReplaceArg);
-}
-
-static llvm::hash_code hash_value(const UseRangesCheck::Signature &Sig) {
-  return llvm::hash_combine_range(Sig.begin(), Sig.end());
-}
-
 namespace {
 
 AST_MATCHER(Expr, hasSideEffects) {
@@ -123,24 +108,26 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes,
 }
 
 void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
-  Replaces = getReplacerMap();
+  auto Replaces = getReplacerMap();
   ReverseDescriptor = getReverseDescriptor();
   auto BeginEndNames = getFreeBeginEndMethods();
   llvm::SmallVector<StringRef, 4> BeginNames{
       llvm::make_first_range(BeginEndNames)};
   llvm::SmallVector<StringRef, 4> EndNames{
       llvm::make_second_range(BeginEndNames)};
-  llvm::DenseSet<ArrayRef<Signature>> Seen;
+  Replacers.clear();
+  llvm::DenseSet<Replacer *> SeenRepl;
   for (auto I = Replaces.begin(), E = Replaces.end(); I != E; ++I) {
-    const ArrayRef<Signature> &Signatures =
-        I->getValue()->getReplacementSignatures();
-    if (!Seen.insert(Signatures).second)
+    auto Replacer = I->getValue();
+    if (!SeenRepl.insert(Replacer.get()).second)
       continue;
-    assert(!Signatures.empty() &&
-           llvm::all_of(Signatures, [](auto Index) { return !Index.empty(); }));
+    Replacers.push_back(Replacer);
+    assert(!Replacer->getReplacementSignatures().empty() &&
+           llvm::all_of(Replacer->getReplacementSignatures(),
+                        [](auto Index) { return !Index.empty(); }));
     std::vector<StringRef> Names(1, I->getKey());
     for (auto J = std::next(I); J != E; ++J)
-      if (J->getValue()->getReplacementSignatures() == Signatures)
+      if (J->getValue() == Replacer)
         Names.push_back(J->getKey());
 
     std::vector<ast_matchers::internal::DynTypedMatcher> TotalMatchers;
@@ -148,7 +135,7 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
     // signatures in order of length(longest to shortest). This way any
     // signature that is a subset of another signature will be matched after the
     // other.
-    SmallVector<Signature> SigVec(Signatures);
+    SmallVector<Signature> SigVec(Replacer->getReplacementSignatures());
     llvm::sort(SigVec, [](auto &L, auto &R) { return R.size() < L.size(); });
     for (const auto &Signature : SigVec) {
       std::vector<ast_matchers::internal::DynTypedMatcher> Matchers;
@@ -163,7 +150,8 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) {
     }
     Finder->addMatcher(
         callExpr(
-            callee(functionDecl(hasAnyName(std::move(Names))).bind(FuncDecl)),
+            callee(functionDecl(hasAnyName(std::move(Names)))
+                       .bind((FuncDecl + Twine(Replacers.size() - 1).str()))),
             ast_matchers::internal::DynTypedMatcher::constructVariadic(
                 ast_matchers::internal::DynTypedMatcher::VO_AnyOf,
                 ASTNodeKind::getFromNodeKind<CallExpr>(),
@@ -205,21 +193,33 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call,
 }
 
 void UseRangesCheck::check(const MatchFinder::MatchResult &Result) {
-  const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>(FuncDecl);
-  std::string Qualified = "::" + Function->getQualifiedNameAsString();
-  auto Iter = Replaces.find(Qualified);
-  assert(Iter != Replaces.end());
+  Replacer *Replacer = nullptr;
+  const FunctionDecl *Function = nullptr;
+  for (auto [Node, Value] : Result.Nodes.getMap()) {
+    StringRef NodeStr(Node);
+    if (!NodeStr.consume_front(FuncDecl))
+      continue;
+    Function = Value.get<FunctionDecl>();
+    size_t Index;
+    if (NodeStr.getAsInteger(10, Index)) {
+      llvm_unreachable("Unable to extract replacer index");
+    }
+    assert(Index < Replacers.size());
+    Replacer = Replacers[Index].get();
+    break;
+  }
+  assert(Replacer && Function);
   SmallString<64> Buffer;
-  for (const Signature &Sig : Iter->getValue()->getReplacementSignatures()) {
+  for (const Signature &Sig : Replacer->getReplacementSignatures()) {
     Buffer.assign({BoundCall, getFullPrefix(Sig)});
     const auto *Call = Result.Nodes.getNodeAs<CallExpr>(Buffer);
     if (!Call)
       continue;
     auto Diag = createDiag(*Call);
-    if (auto ReplaceName = Iter->getValue()->getReplaceName(*Function))
+    if (auto ReplaceName = Replacer->getReplaceName(*Function))
       Diag << FixItHint::CreateReplacement(Call->getCallee()->getSourceRange(),
                                            *ReplaceName);
-    if (auto Include = Iter->getValue()->getHeaderInclusion(*Function))
+    if (auto Include = Replacer->getHeaderInclusion(*Function))
       Diag << Inserter.createIncludeInsertion(
           Result.SourceManager->getFileID(Call->getBeginLoc()), *Include);
     llvm::SmallVector<unsigned, 3> ToRemove;
diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
index 927e9694b0ec7..3a454bcf0cf07 100644
--- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
+++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h
@@ -85,7 +85,7 @@ class UseRangesCheck : public ClangTidyCheck {
   std::optional<TraversalKind> getCheckTraversalKind() const override;
 
 private:
-  ReplacerMap Replaces;
+  std::vector<llvm::IntrusiveRefCntPtr<Replacer>> Replacers;
   std::optional<ReverseIteratorDescriptor> ReverseDescriptor;
   IncludeInserter Inserter;
 };
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
index 6596511c7a38b..69ac9954f4afa 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-ranges/fake_std.h
@@ -7,8 +7,8 @@ template <typename T> class vector {
 public:
   using iterator = T *;
   using const_iterator = const T *;
-  using reverse_iterator = T*;
-  using reverse_const_iterator = const T*;
+  using reverse_iterator = T *;
+  using reverse_const_iterator = const T *;
 
   constexpr const_iterator begin() const;
   constexpr const_iterator end() const;
@@ -72,8 +72,8 @@ template <typename Container> constexpr auto crend(const Container &Cont) {
   return Cont.crend();
 }
 // Find
-template< class InputIt, class T >
-InputIt find( InputIt first, InputIt last, const T& value );
+template <class InputIt, class T>
+InputIt find(InputIt first, InputIt last, const T &value);
 
 // Reverse
 template <typename Iter> void reverse(Iter begin, Iter end);
@@ -82,6 +82,7 @@ template <typename Iter> void reverse(Iter begin, Iter end);
 template <class InputIt1, class InputIt2>
 bool includes(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
 
+inline namespace _V1 {
 // IsPermutation
 template <class ForwardIt1, class ForwardIt2>
 bool is_permutation(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2);
@@ -97,9 +98,10 @@ template <class InputIt1, class InputIt2>
 bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2);
 
 template <class InputIt1, class InputIt2, class BinaryPred>
-bool equal(InputIt1 first1, InputIt1 last1,
-           InputIt2 first2, InputIt2 last2, BinaryPred p) {
-  // Need a definition to suppress undefined_internal_type when invoked with lambda
+bool equal(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
+           BinaryPred p) {
+  // Need a definition to suppress undefined_internal_type when invoked with
+  // lambda
   return true;
 }
 
@@ -108,6 +110,7 @@ void iota(ForwardIt first, ForwardIt last, T value);
 
 template <class ForwardIt>
 ForwardIt rotate(ForwardIt first, ForwardIt middle, ForwardIt last);
+} // namespace _V1
 
 } // namespace std
 

>From a27f816fe56af9cc7f4f296ad6c577f6ea64349f Mon Sep 17 00:00:00 2001
From: Piotr Zegar <me at piotrzegar.pl>
Date: Thu, 25 Jul 2024 17:26:01 +0200
Subject: [PATCH 154/259] [clang-tidy] Fix crash in C language in
 readability-non-const-parameter (#100461)

Fix a crash that happens when a redeclaration has a different number of
parameters than the definition.

Fixes #100340
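
Roughly, the added guard just skips any redeclaration that does not
carry the parameter at the diagnosed index. A toy standalone sketch of
that logic (ToyFnDecl is a made-up stand-in for clang's FunctionDecl,
not the real API):

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

struct ToyFnDecl {
  std::vector<std::string> Params;
  std::size_t getNumParams() const { return Params.size(); }
};

int main() {
  // K&R-style situation: the first redeclaration has no parameter list,
  // the definition has one.
  std::vector<ToyFnDecl> Redecls = {ToyFnDecl{{}}, ToyFnDecl{{"p"}}};
  std::size_t Index = 0; // parameter index being diagnosed
  for (const ToyFnDecl &FnDecl : Redecls) {
    if (FnDecl.getNumParams() <= Index)
      continue; // without this guard, Params[Index] would be out of range
    std::printf("would insert 'const ' before '%s'\n",
                FnDecl.Params[Index].c_str());
  }
  return 0;
}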
---
 .../clang-tidy/readability/NonConstParameterCheck.cpp |  5 ++++-
 .../checkers/readability/non-const-parameter.c        | 11 +++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c

diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
index 95a3a5165e2e8..43b69a24bdb16 100644
--- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp
@@ -157,9 +157,12 @@ void NonConstParameterCheck::diagnoseNonConstParameters() {
     if (!Function)
       continue;
     unsigned Index = Par->getFunctionScopeIndex();
-    for (FunctionDecl *FnDecl : Function->redecls())
+    for (FunctionDecl *FnDecl : Function->redecls()) {
+      if (FnDecl->getNumParams() <= Index)
+        continue;
       Fixes.push_back(FixItHint::CreateInsertion(
           FnDecl->getParamDecl(Index)->getBeginLoc(), "const "));
+    }
 
     diag(Par->getLocation(), "pointer parameter '%0' can be pointer to const")
         << Par->getName() << Fixes;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c b/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c
new file mode 100644
index 0000000000000..db50467f3dd94
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/non-const-parameter.c
@@ -0,0 +1,11 @@
+// RUN: %check_clang_tidy %s readability-non-const-parameter %t
+
+static int f();
+
+int f(p)
+  int *p;
+// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: pointer parameter 'p' can be pointer to const [readability-non-const-parameter]
+// CHECK-FIXES: {{^}}  const int *p;{{$}}
+{
+    return *p;
+}

>From 24a18aafa337b70a5a6c30f077bddc64e8677b48 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev at amd.com>
Date: Thu, 25 Jul 2024 18:44:44 +0300
Subject: [PATCH 155/259] [AMDGPU] Simplify printing row/bank_mask modifiers.
 (#100575)

Also fix a codegen test to use mask values that fit within their
encoding fields.

Part of <https://github.com/llvm/llvm-project/issues/62629>.
---
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 12 ++--------
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  2 --
 .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll  | 24 +++++++++----------
 3 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 983f3c430f7be..fc7aba3483aad 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -47,12 +47,6 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
   printAnnotation(OS, Annot);
 }
 
-void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
-                                          const MCSubtargetInfo &STI,
-                                          raw_ostream &O) {
-  O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
 void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                            const MCSubtargetInfo &STI,
                                            raw_ostream &O) {
@@ -1149,15 +1143,13 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
 void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
-  O << " row_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
+  O << " row_mask:" << formatHex(MI->getOperand(OpNo).getImm());
 }
 
 void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O) {
-  O << " bank_mask:";
-  printU4ImmOperand(MI, OpNo, STI, O);
+  O << " bank_mask:" << formatHex(MI->getOperand(OpNo).getImm());
 }
 
 void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3236816f69bc5..c5fad384348df 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -36,8 +36,6 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                               const MCRegisterInfo &MRI);
 
 private:
-  void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
-                         const MCSubtargetInfo &STI, raw_ostream &O);
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
   void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index b678378e55545..a9a710f9b6723 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -287,7 +287,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -300,7 +300,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -314,7 +314,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -327,7 +327,7 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
-  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true)
   store float %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -405,7 +405,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -418,7 +418,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -431,7 +431,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -444,7 +444,7 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
-  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true)
   store <2 x i16> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -522,7 +522,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 62, i32 61, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -535,7 +535,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 63, i32 63, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -548,7 +548,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 64, i32 64, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -561,7 +561,7 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x
 ; GFX8-NOOPT: s_nop 1
 ; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
-  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 63, i32 128, i1 true)
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true)
   store <2 x half> %tmp0, ptr addrspace(1) %out
   ret void
 }

>From 98e733eaf2af1a5c1d9392e279d21182ffdf560d Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Thu, 25 Jul 2024 16:53:27 +0100
Subject: [PATCH 156/259] [flang][OpenMP] Initialize privatised derived type
 variables (#100417)

Fixes #91928
---
 flang/include/flang/Lower/ConvertVariable.h   |  8 ++++
 flang/lib/Lower/ConvertVariable.cpp           | 23 ++++-----
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp |  6 +++
 .../Lower/OpenMP/private-derived-type.f90     | 47 +++++++++++++++++++
 4 files changed, 73 insertions(+), 11 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/private-derived-type.f90

diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index 515f4695951b4..de394a39e112e 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -62,6 +62,14 @@ using AggregateStoreMap = llvm::DenseMap<AggregateStoreKey, mlir::Value>;
 void instantiateVariable(AbstractConverter &, const pft::Variable &var,
                          SymMap &symMap, AggregateStoreMap &storeMap);
 
+/// Does this variable have a default initialization?
+bool hasDefaultInitialization(const Fortran::semantics::Symbol &sym);
+
+/// Call default initialization runtime routine to initialize \p var.
+void defaultInitializeAtRuntime(Fortran::lower::AbstractConverter &converter,
+                                const Fortran::semantics::Symbol &sym,
+                                Fortran::lower::SymMap &symMap);
+
 /// Create a fir::GlobalOp given a module variable definition. This is intended
 /// to be used when lowering a module definition, not when lowering variables
 /// used from a module. For used variables instantiateVariable must directly be
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 47ad48fb322cc..4fcfa0b126e04 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -72,7 +72,8 @@ static mlir::Value genScalarValue(Fortran::lower::AbstractConverter &converter,
 }
 
 /// Does this variable have a default initialization?
-static bool hasDefaultInitialization(const Fortran::semantics::Symbol &sym) {
+bool Fortran::lower::hasDefaultInitialization(
+    const Fortran::semantics::Symbol &sym) {
   if (sym.has<Fortran::semantics::ObjectEntityDetails>() && sym.size())
     if (!Fortran::semantics::IsAllocatableOrPointer(sym))
       if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
@@ -353,7 +354,7 @@ static mlir::Value genComponentDefaultInit(
       // global constructor since this has no runtime cost.
       componentValue = fir::factory::createUnallocatedBox(
           builder, loc, componentTy, std::nullopt);
-    } else if (hasDefaultInitialization(component)) {
+    } else if (Fortran::lower::hasDefaultInitialization(component)) {
       // Component type has default initialization.
       componentValue = genDefaultInitializerValue(converter, loc, component,
                                                   componentTy, stmtCtx);
@@ -556,7 +557,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
                 builder.createConvert(loc, symTy, fir::getBase(initVal));
             builder.create<fir::HasValueOp>(loc, castTo);
           });
-    } else if (hasDefaultInitialization(sym)) {
+    } else if (Fortran::lower::hasDefaultInitialization(sym)) {
       Fortran::lower::createGlobalInitialization(
           builder, global, [&](fir::FirOpBuilder &builder) {
             Fortran::lower::StatementContext stmtCtx(
@@ -752,17 +753,15 @@ mustBeDefaultInitializedAtRuntime(const Fortran::lower::pft::Variable &var) {
     return true;
   // Local variables (including function results), and intent(out) dummies must
   // be default initialized at runtime if their type has default initialization.
-  return hasDefaultInitialization(sym);
+  return Fortran::lower::hasDefaultInitialization(sym);
 }
 
 /// Call default initialization runtime routine to initialize \p var.
-static void
-defaultInitializeAtRuntime(Fortran::lower::AbstractConverter &converter,
-                           const Fortran::lower::pft::Variable &var,
-                           Fortran::lower::SymMap &symMap) {
+void Fortran::lower::defaultInitializeAtRuntime(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::semantics::Symbol &sym, Fortran::lower::SymMap &symMap) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Location loc = converter.getCurrentLocation();
-  const Fortran::semantics::Symbol &sym = var.getSymbol();
   fir::ExtendedValue exv = converter.getSymbolExtendedValue(sym, &symMap);
   if (Fortran::semantics::IsOptional(sym)) {
     // 15.5.2.12 point 3, absent optional dummies are not initialized.
@@ -927,7 +926,8 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
   if (needDummyIntentoutFinalization(var))
     finalizeAtRuntime(converter, var, symMap);
   if (mustBeDefaultInitializedAtRuntime(var))
-    defaultInitializeAtRuntime(converter, var, symMap);
+    Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
+                                               symMap);
   if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
     auto *builder = &converter.getFirOpBuilder();
     mlir::Location loc = converter.getCurrentLocation();
@@ -1168,7 +1168,8 @@ static void instantiateAlias(Fortran::lower::AbstractConverter &converter,
   // do not try optimizing this to single default initializations of
   // the equivalenced storages. Keep lowering simple.
   if (mustBeDefaultInitializedAtRuntime(var))
-    defaultInitializeAtRuntime(converter, var, symMap);
+    Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(),
+                                               symMap);
 }
 
 //===--------------------------------------------------------------===//
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 7e76a81e0df92..a340b62eb7b66 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -13,6 +13,7 @@
 #include "DataSharingProcessor.h"
 
 #include "Utils.h"
+#include "flang/Lower/ConvertVariable.h"
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
@@ -117,6 +118,11 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) {
   bool success = converter.createHostAssociateVarClone(*sym);
   (void)success;
   assert(success && "Privatization failed due to existing binding");
+
+  bool isFirstPrivate = sym->test(semantics::Symbol::Flag::OmpFirstPrivate);
+  if (!isFirstPrivate &&
+      Fortran::lower::hasDefaultInitialization(sym->GetUltimate()))
+    Fortran::lower::defaultInitializeAtRuntime(converter, *sym, *symTable);
 }
 
 void DataSharingProcessor::copyFirstPrivateSymbol(
diff --git a/flang/test/Lower/OpenMP/private-derived-type.f90 b/flang/test/Lower/OpenMP/private-derived-type.f90
new file mode 100644
index 0000000000000..230484f20c11d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/private-derived-type.f90
@@ -0,0 +1,47 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+subroutine s4
+  type y3
+    integer,allocatable::x
+  end type y3
+  type(y3)::v
+  !$omp parallel
+  !$omp do private(v)
+  do i=1,10
+    v%x=1
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine s4
+
+
+! CHECK-LABEL:   func.func @_QPs4() {
+!                  Example of the lowering for regular derived type variables:
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}> {bindc_name = "v", uniq_name = "_QFs4Ev"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFs4Ev"} : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>, !fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>)
+! CHECK:           %[[VAL_10:.*]] = fir.embox %[[VAL_9]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:           %[[VAL_11:.*]] = fir.address_of
+! CHECK:           %[[VAL_12:.*]] = arith.constant 4 : i32
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+! CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+! CHECK:           %[[VAL_15:.*]] = fir.call @_FortranAInitialize(%[[VAL_13]], %[[VAL_14]], %[[VAL_12]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_23:.*]] = fir.alloca !fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}> {bindc_name = "v", pinned, uniq_name = "_QFs4Ev"}
+! CHECK:             %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFs4Ev"} : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>, !fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>)
+! CHECK:             %[[VAL_25:.*]] = fir.embox %[[VAL_24]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:             %[[VAL_26:.*]] = fir.address_of
+! CHECK:             %[[VAL_27:.*]] = arith.constant 4 : i32
+! CHECK:             %[[VAL_28:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+! CHECK:             %[[VAL_29:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+!                    Check we do call FortranAInitialize on the derived type
+! CHECK:             %[[VAL_30:.*]] = fir.call @_FortranAInitialize(%[[VAL_28]], %[[VAL_29]], %[[VAL_27]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
+! CHECK:             omp.wsloop {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           %[[VAL_39:.*]] = fir.embox %[[VAL_9]]#1 : (!fir.ref<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>
+! CHECK:           %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (!fir.box<!fir.type<_QFs4Ty3{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
+!                  Check the derived type is destroyed
+! CHECK:           %[[VAL_41:.*]] = fir.call @_FortranADestroy(%[[VAL_40]]) fastmath<contract> : (!fir.box<none>) -> none
+! CHECK:           return
+! CHECK:         }

>From abacc5220e228012b0b86ca2ed540ef1a715f1ee Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 25 Jul 2024 16:54:46 +0100
Subject: [PATCH 157/259] Fix unused variable warning. NFC.

---
 llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ed78201101ab3..4323635ff73f0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2678,7 +2678,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
   if (DestTy == Src->getType())
     return replaceInstUsesWith(CI, Src);
 
-  if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
+  if (isa<FixedVectorType>(DestTy)) {
     if (isa<IntegerType>(SrcTy)) {
       // If this is a cast from an integer to vector, check to see if the input
       // is a trunc or zext of a bitcast from vector.  If so, we can replace all

>From 9398cc2ec57721a57af98f6206c816038375ac6e Mon Sep 17 00:00:00 2001
From: Acim Maravic <Acim.Maravic at Syrmia.com>
Date: Thu, 25 Jul 2024 18:01:07 +0200
Subject: [PATCH 158/259] [LLVM][AMDGPU] Copy isConvergent from Pseudo to Real
 instructions (#99658)

This patch copies the isConvergent flag from pseudo instructions to the
corresponding real instructions, so that the flag is also defined for
real instructions.

Flags are not required by the compiler, but for consistency it would be
nice to have them.

Co-authored-by: Acim Maravic <Acim.Maravic at amd.com>
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td  | 3 +++
 llvm/lib/Target/AMDGPU/DSInstructions.td   | 1 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 ++
 llvm/lib/Target/AMDGPU/SMInstructions.td   | 1 +
 llvm/lib/Target/AMDGPU/SOPInstructions.td  | 5 +++++
 llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td | 1 +
 llvm/lib/Target/AMDGPU/VOPCInstructions.td | 1 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td  | 4 ++++
 9 files changed, 19 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a904c8483dbf5..c6668b24f4ef6 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -139,6 +139,7 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let IsAtomicNoRet      = ps.IsAtomicNoRet;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   bits<12> offset;
   bits<5>  cpol;
@@ -355,6 +356,7 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let LGKM_CNT             = ps.LGKM_CNT;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   bits<12> offset;
   bits<5>  cpol;
@@ -2435,6 +2437,7 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
   let MTBUF              = ps.MTBUF;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   bits<24> offset;
   bits<8>  vaddr;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 51da1b2b2b479..3c76fb2f7961f 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -73,6 +73,7 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
   let IsAtomicNoRet      = ps.IsAtomicNoRet;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   let Constraints = ps.Constraints;
   let DisableEncoding = ps.DisableEncoding;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 88c6039473338..351563657aeb5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -104,6 +104,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let VALU                 = ps.VALU;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding fields
   bits<8> vaddr;
@@ -169,6 +170,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   let VALU                 = ps.VALU;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   bits<7> saddr;
   bits<8> vdst;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 9e470e27272c3..4218b7d4cbd5d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -73,6 +73,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
   let IsAtomicNoRet        = ps.IsAtomicNoRet;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   let TSFlags = ps.TSFlags;
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0d6883254c350..de3191bd91df6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -67,6 +67,7 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
   let isBarrier          = ps.isBarrier;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   // encoding
   bits<7> sdst;
@@ -579,6 +580,7 @@ class SOP2_Real<SOP_Pseudo ps, string name = ps.Mnemonic> :
   let DisableEncoding      = ps.DisableEncoding;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding
   bits<7> sdst;
@@ -996,6 +998,7 @@ class SOPK_Real<SOPK_Pseudo ps, string name = ps.Mnemonic> :
   let isBarrier          = ps.isBarrier;
   let Uses               = ps.Uses;
   let Defs               = ps.Defs;
+  let isConvergent       = ps.isConvergent;
 
   // encoding
   bits<7>  sdst;
@@ -1262,6 +1265,7 @@ class SOPC_Real<bits<7> op, SOPC_Pseudo ps> :
   let mayStore             = ps.mayStore;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
 
   // encoding
   bits<8> src0;
@@ -1459,6 +1463,7 @@ class SOPP_Real<SOPP_Pseudo ps, string name = ps.Mnemonic> :
   let isBarrier            = ps.isBarrier;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
+  let isConvergent         = ps.isConvergent;
   bits <16> simm16;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2c0d61ee4afa1..8bb1374a78c4d 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -88,6 +88,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP1_Real_Gen <VOP1_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9989752c2f6bc..44eb5f5abafe0 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -107,6 +107,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let SchedRW              = ps.SchedRW;
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP2_Real_Gen <VOP2_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 3bcee28a2cb78..62ca6261c47c8 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -180,6 +180,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
   let SchedRW              = ps.SchedRW;
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1..3851415ab0cae 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -194,6 +194,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 
   VOPProfile Pfl = ps.Pfl;
 }
@@ -653,6 +654,7 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -689,6 +691,7 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -889,6 +892,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let TRANS                = ps.TRANS;
+  let isConvergent         = ps.isConvergent;
 }
 
 class VOP_DPP_Base <string OpName, VOPProfile P,

>From d8b672dac9f8945ab35cffaa4e196e3d3e61da1f Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier at nvidia.com>
Date: Thu, 25 Jul 2024 18:02:03 +0200
Subject: [PATCH 159/259] [flang][NFC] rename fircg op operand index accessors
 (#100584)

fircg operations have xxxOffset members that return the operand index of
operand xxx. This reads oddly at use sites (e.g. `arrayCoor.shiftOffset`
looks like it is shifting some offset). Rename them to
getXxxOperandIndex.
---
 .../include/flang/Optimizer/CodeGen/CGOps.td  | 74 ++++++++++++-------
 .../include/flang/Optimizer/Dialect/FIROps.td |  2 +-
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 45 +++++------
 3 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/flang/include/flang/Optimizer/CodeGen/CGOps.td b/flang/include/flang/Optimizer/CodeGen/CGOps.td
index c375edee1fa77..f4740a263ffd2 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGOps.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGOps.td
@@ -78,16 +78,24 @@ def fircg_XEmboxOp : fircg_Op<"ext_embox", [AttrSizedOperandSegments]> {
     unsigned getOutRank();
 
     // The shape operands are mandatory and always start at 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned substrOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
     }
-    unsigned lenParamOffset() { return substrOffset() + getSubstr().size(); }
-    unsigned getSourceBoxOffset() {
-      return lenParamOffset() + getLenParams().size();
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getSubstrOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
+    }
+    unsigned getLenParamOperandIndex() {
+      return getSubstrOperandIndex() + getSubstr().size();
+    }
+    unsigned getSourceBoxOperandIndex() {
+      return getLenParamOperandIndex() + getLenParams().size();
     }
   }];
 }
@@ -135,12 +143,18 @@ def fircg_XReboxOp : fircg_Op<"ext_rebox", [AttrSizedOperandSegments]> {
     // The rank of the result box
     unsigned getOutRank();
 
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned substrOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getSubstrOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
     }
   }];
 }
@@ -193,14 +207,22 @@ def fircg_XArrayCoorOp : fircg_Op<"ext_array_coor", [AttrSizedOperandSegments]>
     unsigned getRank();
 
     // Shape is optional, but if it exists, it will be at offset 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-    unsigned sliceOffset() { return shiftOffset() + getShift().size(); }
-    unsigned subcomponentOffset() { return sliceOffset() + getSlice().size(); }
-    unsigned indicesOffset() {
-      return subcomponentOffset() + getSubcomponent().size();
-    }
-    unsigned lenParamsOffset() { return indicesOffset() + getIndices().size(); }
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
+    unsigned getSliceOperandIndex() {
+      return getShiftOperandIndex() + getShift().size();
+    }
+    unsigned getSubcomponentOperandIndex() {
+      return getSliceOperandIndex() + getSlice().size();
+    }
+    unsigned getIndicesOperandIndex() {
+      return getSubcomponentOperandIndex() + getSubcomponent().size();
+    }
+    unsigned getLenParamsOperandIndex() {
+      return getIndicesOperandIndex() + getIndices().size();
+    }
   }];
 }
 
@@ -231,8 +253,10 @@ def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> {
 
   let extraClassDeclaration = [{
     // Shape is optional, but if it exists, it will be at offset 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
+    unsigned getShapeOperandIndex() { return 1; }
+    unsigned getShiftOperandIndex() {
+      return getShapeOperandIndex() + getShape().size();
+    }
   }];
 }
 
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 89c13fa7cebe6..bee8e8f603ce3 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -817,7 +817,7 @@ def fir_EmboxOp : fir_Op<"embox", [NoMemoryEffect, AttrSizedOperandSegments]> {
   let extraClassDeclaration = [{
     bool hasLenParams() { return !getTypeparams().empty(); }
     unsigned numLenParams() { return getTypeparams().size(); }
-    unsigned getSourceBoxOffset() {
+    unsigned getSourceBoxOperandIndex() {
       return 1 + (getShape() ? 1 : 0) + (getSlice() ? 1 : 0)
           + numLenParams();
     }
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f9ea92a843b23..99f1453f1314b 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1431,8 +1431,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
             fir::unwrapPassByRefType(memref.getType()))))
       TODO(xbox.getLoc(),
            "fir.embox codegen dynamic size component in derived type");
-    indices.append(operands.begin() + xbox.subcomponentOffset(),
-                   operands.begin() + xbox.subcomponentOffset() +
+    indices.append(operands.begin() + xbox.getSubcomponentOperandIndex(),
+                   operands.begin() + xbox.getSubcomponentOperandIndex() +
                        xbox.getSubcomponent().size());
   }
 
@@ -1487,7 +1487,7 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
     mlir::Value sourceBox;
     mlir::Type sourceBoxType;
     if (embox.getSourceBox()) {
-      sourceBox = operands[embox.getSourceBoxOffset()];
+      sourceBox = operands[embox.getSourceBoxOperandIndex()];
       sourceBoxType = embox.getSourceBox().getType();
     }
     assert(!embox.getShape() && "There should be no dims on this embox op");
@@ -1519,7 +1519,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     mlir::Value sourceBox;
     mlir::Type sourceBoxType;
     if (xbox.getSourceBox()) {
-      sourceBox = operands[xbox.getSourceBoxOffset()];
+      sourceBox = operands[xbox.getSourceBoxOperandIndex()];
       sourceBoxType = xbox.getSourceBox().getType();
     }
     auto [boxTy, dest, resultEleSize] = consDescriptorPrefix(
@@ -1529,11 +1529,11 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     // Generate the triples in the dims field of the descriptor
     auto i64Ty = mlir::IntegerType::get(xbox.getContext(), 64);
     assert(!xbox.getShape().empty() && "must have a shape");
-    unsigned shapeOffset = xbox.shapeOffset();
+    unsigned shapeOffset = xbox.getShapeOperandIndex();
     bool hasShift = !xbox.getShift().empty();
-    unsigned shiftOffset = xbox.shiftOffset();
+    unsigned shiftOffset = xbox.getShiftOperandIndex();
     bool hasSlice = !xbox.getSlice().empty();
-    unsigned sliceOffset = xbox.sliceOffset();
+    unsigned sliceOffset = xbox.getSliceOperandIndex();
     mlir::Location loc = xbox.getLoc();
     mlir::Value zero = genConstantIndex(loc, i64Ty, rewriter, 0);
     mlir::Value one = genConstantIndex(loc, i64Ty, rewriter, 1);
@@ -1682,7 +1682,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
       if (hasSubcomp)
         getSubcomponentIndices(xbox, xbox.getMemref(), operands, fieldIndices);
       if (hasSubstr)
-        substringOffset = operands[xbox.substrOffset()];
+        substringOffset = operands[xbox.getSubstrOperandIndex()];
       mlir::Type llvmBaseType =
           convertType(fir::unwrapRefType(xbox.getMemref().getType()));
       base = genBoxOffsetGep(rewriter, loc, base, llvmBaseType, ptrOffset,
@@ -1843,7 +1843,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
       if (!rebox.getSubcomponent().empty())
         getSubcomponentIndices(rebox, rebox.getBox(), operands, fieldIndices);
       if (!rebox.getSubstr().empty())
-        substringOffset = operands[rebox.substrOffset()];
+        substringOffset = operands[rebox.getSubstrOperandIndex()];
       base = genBoxOffsetGep(rewriter, loc, base, llvmBaseObjectType, zero,
                              /*cstInteriorIndices=*/std::nullopt, fieldIndices,
                              substringOffset);
@@ -1862,8 +1862,8 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
     llvm::SmallVector<mlir::Value> slicedStrides;
     mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1);
     const bool sliceHasOrigins = !rebox.getShift().empty();
-    unsigned sliceOps = rebox.sliceOffset();
-    unsigned shiftOps = rebox.shiftOffset();
+    unsigned sliceOps = rebox.getSliceOperandIndex();
+    unsigned shiftOps = rebox.getShiftOperandIndex();
     auto strideOps = inputStrides.begin();
     const unsigned inputRank = inputStrides.size();
     for (unsigned i = 0; i < inputRank;
@@ -1912,9 +1912,10 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
              mlir::Value base, mlir::ValueRange inputExtents,
              mlir::ValueRange inputStrides, mlir::ValueRange operands,
              mlir::ConversionPatternRewriter &rewriter) const {
-    mlir::ValueRange reboxShifts{operands.begin() + rebox.shiftOffset(),
-                                 operands.begin() + rebox.shiftOffset() +
-                                     rebox.getShift().size()};
+    mlir::ValueRange reboxShifts{
+        operands.begin() + rebox.getShiftOperandIndex(),
+        operands.begin() + rebox.getShiftOperandIndex() +
+            rebox.getShift().size()};
     if (rebox.getShape().empty()) {
       // Only setting new lower bounds.
       return finalizeRebox(rebox, destBoxTy, dest, base, reboxShifts,
@@ -1934,7 +1935,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
                              ? genConstantIndex(loc, idxTy, rewriter, 1)
                              : inputStrides[0];
     for (unsigned i = 0; i < rebox.getShape().size(); ++i) {
-      mlir::Value rawExtent = operands[rebox.shapeOffset() + i];
+      mlir::Value rawExtent = operands[rebox.getShapeOperandIndex() + i];
       mlir::Value extent = integerCast(loc, rewriter, idxTy, rawExtent);
       newExtents.emplace_back(extent);
       newStrides.emplace_back(stride);
@@ -2137,10 +2138,10 @@ struct XArrayCoorOpConversion
     assert(coor.getShift().empty() || coor.getShift().size() == rank);
     assert(coor.getSlice().empty() || coor.getSlice().size() == 3 * rank);
     mlir::Type idxTy = lowerTy().indexType();
-    unsigned indexOffset = coor.indicesOffset();
-    unsigned shapeOffset = coor.shapeOffset();
-    unsigned shiftOffset = coor.shiftOffset();
-    unsigned sliceOffset = coor.sliceOffset();
+    unsigned indexOffset = coor.getIndicesOperandIndex();
+    unsigned shapeOffset = coor.getShapeOperandIndex();
+    unsigned shiftOffset = coor.getShiftOperandIndex();
+    unsigned sliceOffset = coor.getSliceOperandIndex();
     auto sliceOps = coor.getSlice().begin();
     mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1);
     mlir::Value prevExt = one;
@@ -2238,7 +2239,7 @@ struct XArrayCoorOpConversion
       }
       llvm::SmallVector<mlir::Value> indices = convertSubcomponentIndices(
           loc, elementType,
-          operands.slice(coor.subcomponentOffset(),
+          operands.slice(coor.getSubcomponentOperandIndex(),
                          coor.getSubcomponent().size()));
       args.append(indices.begin(), indices.end());
       rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(coor, llvmPtrTy,
@@ -2262,7 +2263,7 @@ struct XArrayCoorOpConversion
         if (fir::characterWithDynamicLen(eleTy)) {
           assert(coor.getLenParams().size() == 1);
           auto length = integerCast(loc, rewriter, idxTy,
-                                    operands[coor.lenParamsOffset()]);
+                                    operands[coor.getLenParamsOperandIndex()]);
           offset = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, offset,
                                                       length, nsw);
         } else {
@@ -2275,7 +2276,7 @@ struct XArrayCoorOpConversion
       args.push_back(offset);
       llvm::SmallVector<mlir::Value> indices = convertSubcomponentIndices(
           loc, gepObjectType,
-          operands.slice(coor.subcomponentOffset(),
+          operands.slice(coor.getSubcomponentOperandIndex(),
                          coor.getSubcomponent().size()));
       args.append(indices.begin(), indices.end());
     }

>From 3295d377f37a60597321f502d164b5d6b1948e28 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav at gmail.com>
Date: Thu, 25 Jul 2024 20:15:14 +0400
Subject: [PATCH 160/259] [clang] Remove `__is_layout_compatible` from
 revertible type traits list (#100572)

`__is_layout_compatible` was added in Clang 19 (#81506), and at that
time it wasn't entirely clear whether it should be a revertible type
trait or not. We decided to follow the example of other type traits.
Since then #95969 happened, and now we know that we don't want new
revertible type traits.

This patch removes `__is_layout_compatible` from the revertible type
traits list, and leaves a comment explaining what revertible type traits
are and that new type traits should not be added to the list.

The intention is to also cherry-pick this to 19 branch.
---
 clang/lib/Parse/ParseExpr.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 0a017ae79de75..e82b565272831 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -763,6 +763,9 @@ class CastExpressionIdValidator final : public CorrectionCandidateCallback {
 bool Parser::isRevertibleTypeTrait(const IdentifierInfo *II,
                                    tok::TokenKind *Kind) {
   if (RevertibleTypeTraits.empty()) {
+// Revertible type traits are a feature for backwards compatibility with older
+// standard libraries that declare their own structs with the same name as
+// the builtins listed below. New builtins should NOT be added to this list.
 #define RTT_JOIN(X, Y) X##Y
 #define REVERTIBLE_TYPE_TRAIT(Name)                                            \
   RevertibleTypeTraits[PP.getIdentifierInfo(#Name)] = RTT_JOIN(tok::kw_, Name)
@@ -790,7 +793,6 @@ bool Parser::isRevertibleTypeTrait(const IdentifierInfo *II,
     REVERTIBLE_TYPE_TRAIT(__is_fundamental);
     REVERTIBLE_TYPE_TRAIT(__is_integral);
     REVERTIBLE_TYPE_TRAIT(__is_interface_class);
-    REVERTIBLE_TYPE_TRAIT(__is_layout_compatible);
     REVERTIBLE_TYPE_TRAIT(__is_literal);
     REVERTIBLE_TYPE_TRAIT(__is_lvalue_expr);
     REVERTIBLE_TYPE_TRAIT(__is_lvalue_reference);

>From 817cd726454f01e990cd84e5e1d339b120b5ebaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= <Mirko.Brkusanin at amd.com>
Date: Thu, 25 Jul 2024 18:19:26 +0200
Subject: [PATCH 161/259] [AMDGPU] Fix folding clamp into pseudo scalar
 instructions (#100568)

Clamp is canonically a v_max* instruction with a VGPR dst. Folding clamp
into a pseudo scalar instruction can cause issues due to a change in
regbank. We fix this with a copy.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 13 +++++++++-
 .../CodeGen/AMDGPU/si-fold-scalar-clamp.mir   | 26 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9b2cab2eb73a3..32ecf350db59c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1581,7 +1581,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
 
   // Clamp is applied after omod, so it is OK if omod is set.
   DefClamp->setImm(1);
-  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+
+  Register DefReg = Def->getOperand(0).getReg();
+  Register MIDstReg = MI.getOperand(0).getReg();
+  if (TRI->isSGPRReg(*MRI, DefReg)) {
+    // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
+    // instruction with a VGPR dst.
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
+            MIDstReg)
+        .addReg(DefReg);
+  } else {
+    MRI->replaceRegWith(MIDstReg, DefReg);
+  }
   MI.eraseFromParent();
 
   // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir b/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir
new file mode 100644
index 0000000000000..1f4d046a8739f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[V_S_RSQ_F32_e64_:%[0-9]+]]:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_S_RSQ_F32_e64_]]
+    ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[COPY1]], [[COPY1]], implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e32_]]
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, %0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %1, 0, %1, -1, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = nofpexcept V_ADD_F32_e32 %2:vgpr_32, %2:vgpr_32, implicit $mode, implicit $exec
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...

>From e90d552c77428b93b435eaafa81e396cdaf60694 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Thu, 25 Jul 2024 18:25:09 +0200
Subject: [PATCH 162/259] [libc][NFC] Update riscv documentation (#100578)

This adds linux-riscv32 to the documentation and fixes riscv's broken
entrypoint link.
---
 libc/docs/date_and_time.rst | 4 ++--
 libc/docs/math/index.rst    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/docs/date_and_time.rst b/libc/docs/date_and_time.rst
index 303dd3fa12df2..6207375375217 100644
--- a/libc/docs/date_and_time.rst
+++ b/libc/docs/date_and_time.rst
@@ -22,7 +22,7 @@ Implementation Status
 
   - `linux-aarch32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/arm/entrypoints.txt>`_
 
-  - `linux-riscv64 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv64/entrypoints.txt>`_
+  - `linux-riscv and linux-riscv32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv/entrypoints.txt>`_
 
 * To check date and time functions enabled for Windows:
 
@@ -49,7 +49,7 @@ Implementation Status
 +---------------------+---------------------------------------+-------------------+-------------------+-------------------+-------------------+
 | <Func>              |  Linux                                | Windows           | MacOS             | Embedded          | GPU               |
 |                     +---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-|                     | x86_64  | aarch64 | aarch32 | riscv64 | x86_64  | aarch64 | x86_64  | aarch64 | aarch32 | riscv32 | AMD     | nVidia  |
+|                     | x86_64  | aarch64 | aarch32 | riscv64/riscv32 | x86_64  | aarch64 | x86_64  | aarch64 | aarch32 | riscv32 | AMD     | nVidia  |
 +=====================+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+
 | asctime             | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
 +---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 64de548a1ea1c..e7db07f071a8c 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -77,7 +77,7 @@ Implementation Status
 
   - `linux-aarch32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/arm/entrypoints.txt>`_
 
-  - `linux-riscv64 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv64/entrypoints.txt>`_
+  - `linux-riscv and linux-riscv32 <https://github.com/llvm/llvm-project/tree/main/libc/config/linux/riscv/entrypoints.txt>`_
 
 * To check math functions enabled for Windows:
 

>From 1db7d2ae226c96e0243309e1688c6e0cfb9236bd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Thu, 25 Jul 2024 17:49:29 +0800
Subject: [PATCH 163/259] [ELF] Add missing `static` decl on several helpers;
 NFC

---
 llvm/include/llvm/Object/ELF.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 6bf42de89e1c4..8a363741ee628 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -127,8 +127,8 @@ template <class T> struct DataRegion {
 };
 
 template <class ELFT>
-std::string getSecIndexForError(const ELFFile<ELFT> &Obj,
-                                const typename ELFT::Shdr &Sec) {
+static std::string getSecIndexForError(const ELFFile<ELFT> &Obj,
+                                       const typename ELFT::Shdr &Sec) {
   auto TableOrErr = Obj.sections();
   if (TableOrErr)
     return "[index " + std::to_string(&Sec - &TableOrErr->front()) + "]";
@@ -151,8 +151,8 @@ static std::string describe(const ELFFile<ELFT> &Obj,
 }
 
 template <class ELFT>
-std::string getPhdrIndexForError(const ELFFile<ELFT> &Obj,
-                                 const typename ELFT::Phdr &Phdr) {
+static std::string getPhdrIndexForError(const ELFFile<ELFT> &Obj,
+                                        const typename ELFT::Phdr &Phdr) {
   auto Headers = Obj.program_headers();
   if (Headers)
     return ("[index " + Twine(&Phdr - &Headers->front()) + "]").str();
@@ -166,8 +166,8 @@ static inline Error defaultWarningHandler(const Twine &Msg) {
 }
 
 template <class ELFT>
-bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
-                         const typename ELFT::Shdr &Sec) {
+static bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
+                                const typename ELFT::Shdr &Sec) {
   // SHT_NOBITS sections don't need to have an offset inside the segment.
   if (Sec.sh_type == ELF::SHT_NOBITS)
     return true;
@@ -184,8 +184,8 @@ bool checkSectionOffsets(const typename ELFT::Phdr &Phdr,
 // Check that an allocatable section belongs to a virtual address
 // space of a segment.
 template <class ELFT>
-bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
-                     const typename ELFT::Shdr &Sec) {
+static bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
+                            const typename ELFT::Shdr &Sec) {
   if (!(Sec.sh_flags & ELF::SHF_ALLOC))
     return true;
 
@@ -203,8 +203,8 @@ bool checkSectionVMA(const typename ELFT::Phdr &Phdr,
 }
 
 template <class ELFT>
-bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
-                        const typename ELFT::Shdr &Sec) {
+static bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
+                               const typename ELFT::Shdr &Sec) {
   return checkSectionOffsets<ELFT>(Phdr, Sec) &&
          checkSectionVMA<ELFT>(Phdr, Sec);
 }
@@ -212,7 +212,7 @@ bool isSectionInSegment(const typename ELFT::Phdr &Phdr,
 // HdrHandler is called once with the number of relocations and whether the
 // relocations have addends. EntryHandler is called once per decoded relocation.
 template <bool Is64>
-Error decodeCrel(
+static Error decodeCrel(
     ArrayRef<uint8_t> Content,
     function_ref<void(uint64_t /*relocation count*/, bool /*explicit addends*/)>
         HdrHandler,

>From 342328db6b2a69cf59c01a6115b1baab925c0736 Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat <zahira.ammarguellat at intel.com>
Date: Thu, 25 Jul 2024 09:32:34 -0700
Subject: [PATCH 164/259] [CLANG] Emit warning in finite math mode when INF and
 NAN are used. (#99672)

In finite math mode, when the special math builtins `__builtin_inf` and
`__builtin_nan` are used, a warning is emitted both where the builtin is
expanded and at the call point.
The warning at the call point was missing for `__builtin_inf`; this
patch fixes the issue
(https://github.com/llvm/llvm-project/issues/98018).
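
As a rough, hand-written illustration (not part of the patch or its tests),
this is the kind of code that now draws the call-point warning for the
infinity builtins when infinities and NaNs are disabled; the file and
function names are made up:

  // example.cpp, compiled with something like `clang++ -ffinite-math-only -c example.cpp`
  float use_special_values(float x) {
    float inf  = __builtin_inff();    // infinity builtin: the call-point warning was missing before
    float qnan = __builtin_nanf("");  // NaN builtin: already warned at the call point
    return x == inf ? qnan : x;
  }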
---
 clang/lib/Sema/SemaChecking.cpp               | 45 +++++++++++++++----
 clang/test/Headers/float.c                    |  5 ++-
 .../Sema/warn-infinity-nan-disabled-lnx.cpp   | 28 ++++++++----
 3 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index cf1196ad23c21..99a1471d9d06d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -8202,20 +8202,47 @@ static bool IsStdFunction(const FunctionDecl *FDecl,
   return true;
 }
 
+enum class MathCheck { NaN, Inf };
+static bool IsInfOrNanFunction(StringRef calleeName, MathCheck Check) {
+  auto MatchesAny = [&](std::initializer_list<llvm::StringRef> names) {
+    return std::any_of(names.begin(), names.end(), [&](llvm::StringRef name) {
+      return calleeName == name;
+    });
+  };
+
+  switch (Check) {
+  case MathCheck::NaN:
+    return MatchesAny({"__builtin_nan", "__builtin_nanf", "__builtin_nanl",
+                       "__builtin_nanf16", "__builtin_nanf128"});
+  case MathCheck::Inf:
+    return MatchesAny({"__builtin_inf", "__builtin_inff", "__builtin_infl",
+                       "__builtin_inff16", "__builtin_inff128"});
+  default:
+    return false;
+  }
+}
+
 void Sema::CheckInfNaNFunction(const CallExpr *Call,
                                const FunctionDecl *FDecl) {
   FPOptions FPO = Call->getFPFeaturesInEffect(getLangOpts());
-  if ((IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered") ||
-       (Call->getBuiltinCallee() == Builtin::BI__builtin_nanf)) &&
-      FPO.getNoHonorNaNs())
+  bool HasIdentifier = FDecl->getIdentifier() != nullptr;
+  bool IsNaNOrIsUnordered =
+      IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered");
+  bool IsSpecialNaN =
+      HasIdentifier && IsInfOrNanFunction(FDecl->getName(), MathCheck::NaN);
+  if ((IsNaNOrIsUnordered || IsSpecialNaN) && FPO.getNoHonorNaNs()) {
     Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
         << 1 << 0 << Call->getSourceRange();
-  else if ((IsStdFunction(FDecl, "isinf") ||
-            (IsStdFunction(FDecl, "isfinite") ||
-             (FDecl->getIdentifier() && FDecl->getName() == "infinity"))) &&
-           FPO.getNoHonorInfs())
-    Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
-        << 0 << 0 << Call->getSourceRange();
+  } else {
+    bool IsInfOrIsFinite =
+        IsStdFunction(FDecl, "isinf") || IsStdFunction(FDecl, "isfinite");
+    bool IsInfinityOrIsSpecialInf =
+        HasIdentifier && ((FDecl->getName() == "infinity") ||
+                          IsInfOrNanFunction(FDecl->getName(), MathCheck::Inf));
+    if ((IsInfOrIsFinite || IsInfinityOrIsSpecialInf) && FPO.getNoHonorInfs())
+      Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled)
+          << 0 << 0 << Call->getSourceRange();
+  }
 }
 
 void Sema::CheckAbsoluteValueFunction(const CallExpr *Call,
diff --git a/clang/test/Headers/float.c b/clang/test/Headers/float.c
index d524d0e53f3fd..051c50561c252 100644
--- a/clang/test/Headers/float.c
+++ b/clang/test/Headers/float.c
@@ -223,8 +223,9 @@
   #ifndef NAN
     #error "Mandatory macro NAN is missing."
   #endif
-  // FIXME: the NAN diagnostic should only be issued once, not twice.
-  _Static_assert(_Generic(INFINITY, float : 1, default : 0), ""); // finite-warning {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// FIXME: the NAN and INF diagnostics should only be issued once, not twice.
+  _Static_assert(_Generic(INFINITY, float : 1, default : 0), ""); // finite-warning {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} \
+								  finite-warning {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
   _Static_assert(_Generic(NAN, float : 1, default : 0), ""); // finite-warning {{use of NaN is undefined behavior due to the currently enabled floating-point options}} \
                                                                 finite-warning {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}}
 
diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
index 03a432e05851d..d18aaad0e6925 100644
--- a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
+++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp
@@ -87,11 +87,15 @@ class numeric_limits<double>  {
 
 int compareit(float a, float b) {
   volatile int i, j, k, l, m, n, o, p;
-// no-inf-no-nan-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning at +1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   i = a == INFINITY;
 
-// no-inf-no-nan-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning at +1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY == a;
 
@@ -107,11 +111,15 @@ int compareit(float a, float b) {
 // no-nan-warning at +1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = NAN == a;
 
-// no-inf-no-nan-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning at +1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY <= a;
 
-// no-inf-no-nan-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning at +1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = INFINITY < a;
 
@@ -192,7 +200,9 @@ int compareit(float a, float b) {
 // no-nan-warning at +1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   j = isunorderedf(a, NAN);
 
-// no-inf-no-nan-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-inf-warning at +1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
   j = isunorderedf(a, INFINITY);
 
@@ -204,9 +214,11 @@ int compareit(float a, float b) {
 // no-nan-warning at +1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   i = std::isunordered(a, NAN);
 
-// no-inf-no-nan-warning at +4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
-// no-inf-no-nan-warning at +3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
-// no-inf-warning at +2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +6 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +5 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-no-nan-warning at +4 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}}
+// no-inf-warning at +2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}}
 // no-nan-warning at +1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}}
   i = std::isunordered(a, INFINITY);
 

>From 3ce6f68ee736e2e6afaa2fe5fcde71f26b21d0cf Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq at xs4all.nl>
Date: Thu, 25 Jul 2024 18:37:36 +0200
Subject: [PATCH 165/259] [libc++][spaceship] Marks P1614 as complete. (#99375)

Implements parts of:
- P1902R1 Missing feature-test macros 2017-2019

Completes:
- P1614R2 The Mothership has Landed

Fixes #100018
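
For illustration only (not taken from the patch), user code that wants the
P1614 library support can key off the bumped feature-test macro value:

  #include <version>

  #if __cpp_lib_three_way_comparison >= 201907L
    // <compare> and the library-wide operator<=> support from P1614R2 are available.
  #endif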
---
 libcxx/docs/FeatureTestMacroTable.rst              |  2 +-
 libcxx/docs/ReleaseNotes/19.rst                    |  1 +
 libcxx/docs/Status/Cxx20.rst                       |  1 +
 libcxx/docs/Status/Cxx20Papers.csv                 |  2 +-
 libcxx/docs/Status/SpaceshipPapers.csv             |  2 +-
 libcxx/include/version                             |  4 ++--
 .../compare.version.compile.pass.cpp               | 14 +++++++-------
 .../version.version.compile.pass.cpp               | 14 +++++++-------
 .../generate_feature_test_macro_components.py      |  3 +--
 9 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 262da3f8937d2..a1506e115fe70 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -290,7 +290,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_syncbuf``                                      ``201803L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_three_way_comparison``                         ``201711L``
+    ``__cpp_lib_three_way_comparison``                         ``201907L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_address``                                   ``201711L``
     ---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 924ab29b12859..3d79def336a87 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -53,6 +53,7 @@ Implemented Papers
 ------------------
 
 - P1132R8 - ``out_ptr`` - a scalable output pointer abstraction
+- P1614R2 - The Mothership has Landed
 - P2637R3 - Member ``visit``
 - P2652R2 - Disallow User Specialization of ``allocator_traits``
 - P2819R2 - Add ``tuple`` protocol to ``complex``
diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst
index c00d6fb237286..b76e30fbb3712 100644
--- a/libcxx/docs/Status/Cxx20.rst
+++ b/libcxx/docs/Status/Cxx20.rst
@@ -48,6 +48,7 @@ Paper Status
    .. [#note-P0883.1] P0883: shared_ptr and floating-point changes weren't applied as they themselves aren't implemented yet.
    .. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0.
    .. [#note-P0660] P0660: The paper is implemented but the features are experimental and can be enabled via ``-fexperimental-library``.
+   .. [#note-P1614] P1614: ``std::strong_order(long double, long double)`` is partly implemented.
    .. [#note-P0355] P0355: The implementation status is:
 
       * ``Calendars`` mostly done in Clang 7
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 34fc5586f74d9..4015d7ad48b06 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -123,7 +123,7 @@
 "`P1522R1 <https://wg21.link/P1522R1>`__","LWG","Iterator Difference Type and Integer Overflow","Cologne","|Complete|","15.0","|ranges|"
 "`P1523R1 <https://wg21.link/P1523R1>`__","LWG","Views and Size Types","Cologne","|Complete|","15.0","|ranges|"
 "`P1612R1 <https://wg21.link/P1612R1>`__","LWG","Relocate Endian's Specification","Cologne","|Complete|","10.0"
-"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|In Progress|",""
+"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|Complete| [#note-P1614]_","19.0"
 "`P1638R1 <https://wg21.link/P1638R1>`__","LWG","basic_istream_view::iterator should not be copyable","Cologne","|Complete|","16.0","|ranges|"
 "`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","|Complete|","19.0"
 "`P1644R0 <https://wg21.link/P1644R0>`__","LWG","Add wait/notify to atomic<shared_ptr>","Cologne","",""
diff --git a/libcxx/docs/Status/SpaceshipPapers.csv b/libcxx/docs/Status/SpaceshipPapers.csv
index 39e1f968c1754..1ab64a9caf86a 100644
--- a/libcxx/docs/Status/SpaceshipPapers.csv
+++ b/libcxx/docs/Status/SpaceshipPapers.csv
@@ -1,5 +1,5 @@
 "Number","Name","Status","First released version"
-`P1614R2 <https://wg21.link/P1614R2>`_,The Mothership has Landed,|In Progress|,
+`P1614R2 <https://wg21.link/P1614R2>`_,The Mothership has Landed,|Complete|,19.0
 `P2404R3 <https://wg21.link/P2404R3>`_,"Relaxing ``equality_comparable_with``'s, ``totally_ordered_with``'s, and ``three_way_comparable_with``'s common reference requirements to support move-only types",,
 `LWG3330 <https://wg21.link/LWG3330>`_,Include ``<compare>`` from most library headers,"|Complete|","13.0"
 `LWG3347 <https://wg21.link/LWG3347>`_,"``std::pair<T, U>`` now requires ``T`` and ``U`` to be *less-than-comparable*",|Nothing To Do|,
diff --git a/libcxx/include/version b/libcxx/include/version
index 40548098a92d6..fe64343eafbc9 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -238,7 +238,7 @@ __cpp_lib_string_view                                   202403L <string> <string
 __cpp_lib_submdspan                                     202306L <mdspan>
 __cpp_lib_syncbuf                                       201803L <syncstream>
 __cpp_lib_text_encoding                                 202306L <text_encoding>
-__cpp_lib_three_way_comparison                          201711L <compare>
+__cpp_lib_three_way_comparison                          201907L <compare>
 __cpp_lib_to_address                                    201711L <memory>
 __cpp_lib_to_array                                      201907L <array>
 __cpp_lib_to_chars                                      202306L <charconv>
@@ -446,7 +446,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_SYNCSTREAM)
 #   define __cpp_lib_syncbuf                            201803L
 # endif
-# define __cpp_lib_three_way_comparison                 201711L
+# define __cpp_lib_three_way_comparison                 201907L
 # define __cpp_lib_to_address                           201711L
 # define __cpp_lib_to_array                             201907L
 # define __cpp_lib_type_identity                        201806L
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
index aac00f20c7b45..1d61f43f9ee51 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp
@@ -16,7 +16,7 @@
 // Test the feature test macros defined by <compare>
 
 /*  Constant                          Value
-    __cpp_lib_three_way_comparison    201711L [C++20]
+    __cpp_lib_three_way_comparison    201907L [C++20]
 */
 
 #include <compare>
@@ -45,8 +45,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++20"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++20"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++20"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -54,8 +54,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++23"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++23"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++23"
 # endif
 
 #elif TEST_STD_VER > 23
@@ -63,8 +63,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++26"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++26"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index f26e7dc4b4c63..b8bad696f1bae 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -221,7 +221,7 @@
     __cpp_lib_submdspan                                     202306L [C++26]
     __cpp_lib_syncbuf                                       201803L [C++20]
     __cpp_lib_text_encoding                                 202306L [C++26]
-    __cpp_lib_three_way_comparison                          201711L [C++20]
+    __cpp_lib_three_way_comparison                          201907L [C++20]
     __cpp_lib_to_address                                    201711L [C++20]
     __cpp_lib_to_array                                      201907L [C++20]
     __cpp_lib_to_chars                                      201611L [C++17]
@@ -4438,8 +4438,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++20"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++20"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++20"
 # endif
 
 # ifndef __cpp_lib_to_address
@@ -6037,8 +6037,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++23"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++23"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++23"
 # endif
 
 # ifndef __cpp_lib_to_address
@@ -7960,8 +7960,8 @@
 # ifndef __cpp_lib_three_way_comparison
 #   error "__cpp_lib_three_way_comparison should be defined in c++26"
 # endif
-# if __cpp_lib_three_way_comparison != 201711L
-#   error "__cpp_lib_three_way_comparison should have the value 201711L in c++26"
+# if __cpp_lib_three_way_comparison != 201907L
+#   error "__cpp_lib_three_way_comparison should have the value 201907L in c++26"
 # endif
 
 # ifndef __cpp_lib_to_address
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index a351112471295..6c42748002aee 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1302,8 +1302,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_three_way_comparison",
-            "values": {"c++20": 201711},
-            # {"c++20": 201907} # P1614R2 The Mothership has Landed (see P1902R1 Missing feature-test macros 2017-2019)
+            "values": {"c++20": 201907},
             "headers": ["compare"],
         },
         {

>From 06134540126e3b82b66e82d577ac194584e76153 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <akash.banerjee at amd.com>
Date: Thu, 25 Jul 2024 17:46:18 +0100
Subject: [PATCH 166/259] [OpenMP] Fix OpenMPIRBuilder generating incorrect
 duplicate SrcLocInfo (#100364)

This should further fix some of the incorrect debug info being generated,
related to #97458.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 98da6e2efcb5c..45b568ab5248e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3426,11 +3426,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   else
     Config.setGridValue(getGridValue(T, ReductionFunc));
 
-  uint32_t SrcLocStrSize;
-  Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
-  Value *RTLoc =
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
-
   // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
   // RedList, shuffle_reduce_func, interwarp_copy_func);
   // or
@@ -3483,7 +3478,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
     Value *WcFuncCast =
         Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
-    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
+                     WcFuncCast};
     Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
         RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
     Res = Builder.CreateCall(Pv2Ptr, Args);
@@ -3506,7 +3502,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     Value *KernelTeamsReductionPtr = Builder.CreateCall(
         RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
 
-    Value *Args3[] = {RTLoc,
+    Value *Args3[] = {SrcLocInfo,
                       KernelTeamsReductionPtr,
                       Builder.getInt32(ReductionBufNum),
                       ReductionDataSize,

>From f67fa3be4db68afc08c7f3d9523f1533fa5687b7 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 25 Jul 2024 17:46:48 +0100
Subject: [PATCH 167/259] [AArch64] Remove special-case inserted shuffle cost.

This special case tried to detect whether the shuffle vector will become multiple
inserts into an existing vector, with one of the lanes already in place. If so,
it reduced the cost by 1 to represent that it can insert n-1 vector lanes.
This isn't always true, though, as the original vector may need to be moved to a
new value before new values can be inserted into it, if other values from the
original are still needed.

This didn't affect performance much when I tried it, but it should hopefully start
to address a regression we see from differences in SLP vectorization lane
orders.
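
As a simplified, standalone sketch (not the actual TargetTransformInfo code;
the names mirror the patch but the function itself is illustrative only), the
removed special case and the new worst-case behaviour differ roughly as follows:

  #include <cstddef>
  #include <vector>

  // Mask holds the per-lane source indices of one legalized sub-shuffle and
  // LTNumElts is the number of lanes per legalized vector.
  unsigned worstCaseShuffleCost(const std::vector<unsigned> &Mask,
                                unsigned LTNumElts, bool UseRemovedSpecialCase) {
    if (UseRemovedSpecialCase)
      for (std::size_t I = 0; I != Mask.size(); ++I)
        if (Mask[I] % LTNumElts == I)
          return LTNumElts - 1; // some lane already in place: assume n-1 inserts
    return LTNumElts;           // after this patch: always one move per lane
  }

The n-1 branch is dropped because the destination vector may itself need to be
copied to a new value first when other lanes of the original are still live.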
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +-
 .../CostModel/AArch64/shuffle-other.ll        |  26 ++--
 .../VectorCombine/AArch64/select-shuffle.ll   | 140 +++++++++---------
 3 files changed, 84 insertions(+), 89 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 45148449dfb82..7de813f603264 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4065,15 +4065,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
           NMask.push_back(MaskElt % LTNumElts);
       }
       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
-      // getShuffleCost. If not then cost it using the worst case.
+      // getShuffleCost. If not then cost it using the worst case as the number
+      // of element moves into a new vector.
       if (NumSources <= 2)
         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
-      else if (any_of(enumerate(NMask), [&](const auto &ME) {
-                 return ME.value() % LTNumElts == ME.index();
-               }))
-        Cost += LTNumElts - 1;
       else
         Cost += LTNumElts;
     }
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index d67f056366104..7f4030a81e749 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -367,7 +367,7 @@ define void @multipart() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -409,15 +409,15 @@ define void @vst3(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
@@ -453,15 +453,15 @@ define void @vst4(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index b49f3c9f3eeb2..775ad4c5ecc36 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -22,13 +22,11 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) {
 
 define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test1_reduce(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 6, i32 9, i32 12, i32 15, i32 21, i32 24, i32 27, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 23, i32 26, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 4, i32 7, i32 10, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 8, i32 11, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 27, i32 12, i32 29, i32 30, i32 15>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 4, i32 19, i32 8, i32 7, i32 18, i32 11, i32 10, i32 17, i32 14, i32 13, i32 16>
+; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -741,70 +739,70 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32>
-; CHECK-NEXT:    [[TMP40:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP40]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]]
-; CHECK-NEXT:    [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]]
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP51]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP56:%.*]] = add nsw <16 x i32> [[TMP53]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP54]]
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP68:%.*]] = add nsw <16 x i32> [[TMP65]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> [[TMP69]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP70]], [[TMP72]]
-; CHECK-NEXT:    [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP78:%.*]] = and <16 x i32> [[TMP77]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]]
-; CHECK-NEXT:    [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP82]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP82]], 16
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
+; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]]
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> [[TMP43]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP60:%.*]] = add nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]

>From 7304936479a7eb61adc9edcaf6ce56e4792590ad Mon Sep 17 00:00:00 2001
From: Timm Bäder <tbaeder at redhat.com>
Date: Thu, 25 Jul 2024 18:49:38 +0200
Subject: [PATCH 168/259] [clang][Interp] Add preliminary __builtin_constant_p
 implementation

This is not perfect or complete, but it helps us pass the simple tests
and those tests where __builtin_constant_p is not the main subject of
testing.
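
For context, a minimal sketch of the folding behaviour this builtin is expected to model, following the GCC rules described in the comment added to InterpBuiltin.cpp below. This is illustrative only and not part of the patch; the `runtime_value` helper is hypothetical:

```cpp
// __builtin_constant_p(e) folds to 1 when `e` can be evaluated to a known
// constant (integers, floats, enums, pointers to string literals, null
// pointers), and to 0 otherwise.
constexpr int K = 100;

static_assert(__builtin_constant_p(42), "integer literal folds");
static_assert(__builtin_constant_p(K + 1), "constexpr arithmetic folds");
static_assert(__builtin_constant_p("abc"), "pointer to a string literal folds");
static_assert(__builtin_constant_p(nullptr), "null pointer folds");

int runtime_value(int x) {
  // The operand depends on a runtime parameter, so this is expected to
  // evaluate to 0 at -O0 (it may still fold to 1 after inlining).
  return __builtin_constant_p(x);
}
```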
---
 clang/lib/AST/Interp/ByteCodeEmitter.cpp     |  3 +-
 clang/lib/AST/Interp/InterpBuiltin.cpp       | 74 ++++++++++++++++++++
 clang/test/AST/Interp/builtin-constant-p.cpp | 14 ++++
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/AST/Interp/builtin-constant-p.cpp

diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index fee4432a8f661..a01fa15dc0b7d 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -28,7 +28,8 @@ using namespace clang::interp;
 /// but that is not correct for our use cases.
 static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
   return BuiltinID == Builtin::BI__builtin_classify_type ||
-         BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size;
+         BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size ||
+         BuiltinID == Builtin::BI__builtin_constant_p;
 }
 
 Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c170042144acc..c59bbc8313edc 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
+#include "Compiler.h"
+#include "EvalEmitter.h"
 #include "Interp.h"
 #include "PrimType.h"
 #include "clang/AST/OSLog.h"
@@ -1127,6 +1129,73 @@ static bool interp__builtin_ptrauth_string_discriminator(
   return true;
 }
 
+// FIXME: This implementation is not complete.
+// The Compiler instance we create cannot access the current stack frame, local
+// variables, function parameters, etc. We also need protection from
+// side-effects, fatal errors, etc.
+static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
+                                       const InterpFrame *Frame,
+                                       const Function *Func,
+                                       const CallExpr *Call) {
+  const Expr *Arg = Call->getArg(0);
+  QualType ArgType = Arg->getType();
+
+  auto returnInt = [&S, Call](bool Value) -> bool {
+    pushInteger(S, Value, Call->getType());
+    return true;
+  };
+
+  // __builtin_constant_p always has one operand. The rules which gcc follows
+  // are not precisely documented, but are as follows:
+  //
+  //  - If the operand is of integral, floating, complex or enumeration type,
+  //    and can be folded to a known value of that type, it returns 1.
+  //  - If the operand can be folded to a pointer to the first character
+  //    of a string literal (or such a pointer cast to an integral type)
+  //    or to a null pointer or an integer cast to a pointer, it returns 1.
+  //
+  // Otherwise, it returns 0.
+  //
+  // FIXME: GCC also intends to return 1 for literals of aggregate types, but
+  // its support for this did not work prior to GCC 9 and is not yet well
+  // understood.
+  if (ArgType->isIntegralOrEnumerationType() || ArgType->isFloatingType() ||
+      ArgType->isAnyComplexType() || ArgType->isPointerType() ||
+      ArgType->isNullPtrType()) {
+    InterpStack Stk;
+    Compiler<EvalEmitter> C(S.Ctx, S.P, S, Stk);
+    auto Res = C.interpretExpr(Arg, /*ConvertResultToRValue=*/Arg->isGLValue());
+    if (Res.isInvalid()) {
+      C.cleanup();
+      Stk.clear();
+    }
+
+    const APValue &LV = Res.toAPValue();
+    if (!Res.isInvalid() && LV.isLValue()) {
+      APValue::LValueBase Base = LV.getLValueBase();
+      if (Base.isNull()) {
+        // A null base is acceptable.
+        return returnInt(true);
+      } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
+        if (!isa<StringLiteral>(E))
+          return returnInt(false);
+        return returnInt(LV.getLValueOffset().isZero());
+      } else if (Base.is<TypeInfoLValue>()) {
+        // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
+        // evaluate to true.
+        return returnInt(true);
+      } else {
+        // Any other base is not constant enough for GCC.
+        return returnInt(false);
+      }
+    }
+
+    return returnInt(!Res.isInvalid() && !Res.empty());
+  }
+
+  return returnInt(false);
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
   const InterpFrame *Frame = S.Current;
@@ -1456,6 +1525,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
       return false;
     break;
 
+  case Builtin::BI__builtin_constant_p:
+    if (!interp__builtin_constant_p(S, OpPC, Frame, F, Call))
+      return false;
+    break;
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/test/AST/Interp/builtin-constant-p.cpp b/clang/test/AST/Interp/builtin-constant-p.cpp
new file mode 100644
index 0000000000000..0d222d1c96277
--- /dev/null
+++ b/clang/test/AST/Interp/builtin-constant-p.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+
+static_assert(__builtin_constant_p(12), "");
+static_assert(__builtin_constant_p(1.0), "");
+
+constexpr int I = 100;
+static_assert(__builtin_constant_p(I), "");
+static_assert(__builtin_constant_p(I + 10), "");
+static_assert(__builtin_constant_p(I + 10.0), "");
+static_assert(__builtin_constant_p(nullptr), "");
+static_assert(__builtin_constant_p(&I), ""); // both-error {{failed due to requirement}}
+static_assert(__builtin_constant_p((void)I), ""); // both-error {{failed due to requirement}}

>From 3eaf9f7237b088e9382739808d4f723b2c11dd8c Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 25 Jul 2024 18:08:12 +0100
Subject: [PATCH 169/259] LAA: fix style after cursory reading (NFC) (#100447)

---
 .../llvm/Analysis/LoopAccessAnalysis.h        | 16 ++--
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 83 +++++++++----------
 2 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index afafb74bdcb0a..cc40d2e83f2e0 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -199,7 +199,8 @@ class MemoryDepChecker {
   /// Check whether the dependencies between the accesses are safe.
   ///
   /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
+  bool areDepsSafe(const DepCandidates &AccessSets,
+                   const MemAccessInfoList &CheckDeps,
                    const DenseMap<Value *, SmallVector<const Value *, 16>>
                        &UnderlyingObjects);
 
@@ -405,14 +406,15 @@ class RuntimePointerChecking;
 struct RuntimeCheckingPtrGroup {
   /// Create a new pointer checking group containing a single
   /// pointer, with index \p Index in RtCheck.
-  RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck);
+  RuntimeCheckingPtrGroup(unsigned Index,
+                          const RuntimePointerChecking &RtCheck);
 
   /// Tries to add the pointer recorded in RtCheck at index
   /// \p Index to this pointer checking group. We can only add a pointer
   /// to a checking group if we will still be able to get
   /// the upper and lower bounds of the check. Returns true in case
   /// of success, false otherwise.
-  bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck);
+  bool addPointer(unsigned Index, const RuntimePointerChecking &RtCheck);
   bool addPointer(unsigned Index, const SCEV *Start, const SCEV *End,
                   unsigned AS, bool NeedsFreeze, ScalarEvolution &SE);
 
@@ -718,8 +720,8 @@ class LoopAccessInfo {
 private:
   /// Analyze the loop. Returns true if all memory access in the loop can be
   /// vectorized.
-  bool analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI,
-                   DominatorTree *DT);
+  bool analyzeLoop(AAResults *AA, const LoopInfo *LI,
+                   const TargetLibraryInfo *TLI, DominatorTree *DT);
 
   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
@@ -730,8 +732,8 @@ class LoopAccessInfo {
   /// LAA does not directly emits the remarks.  Instead it stores it which the
   /// client can retrieve and presents as its own analysis
   /// (e.g. -Rpass-analysis=loop-vectorize).
-  OptimizationRemarkAnalysis &recordAnalysis(StringRef RemarkName,
-                                             Instruction *Instr = nullptr);
+  OptimizationRemarkAnalysis &
+  recordAnalysis(StringRef RemarkName, const Instruction *Instr = nullptr);
 
   /// Collect memory access with loop invariant strides.
   ///
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 37320720c5329..646d2f7ef3077 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -181,7 +181,7 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 }
 
 RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
-    unsigned Index, RuntimePointerChecking &RtCheck)
+    unsigned Index, const RuntimePointerChecking &RtCheck)
     : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
       AddressSpace(RtCheck.Pointers[Index]
                        .PointerValue->getType()
@@ -280,8 +280,8 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
   if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
     return false;
 
-  PointerInfo *Src = &Pointers[CGI.Members[0]];
-  PointerInfo *Sink = &Pointers[CGJ.Members[0]];
+  const PointerInfo *Src = &Pointers[CGI.Members[0]];
+  const PointerInfo *Sink = &Pointers[CGJ.Members[0]];
 
   // If either pointer is read and written, multiple checks may be needed. Bail
   // out.
@@ -385,7 +385,7 @@ SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
 
       if (needsChecking(CGI, CGJ)) {
         CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
-        Checks.push_back(std::make_pair(&CGI, &CGJ));
+        Checks.emplace_back(&CGI, &CGJ);
       }
     }
   }
@@ -420,8 +420,8 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
   return C->getValue()->isNegative() ? J : I;
 }
 
-bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
-                                         RuntimePointerChecking &RtCheck) {
+bool RuntimeCheckingPtrGroup::addPointer(
+    unsigned Index, const RuntimePointerChecking &RtCheck) {
   return addPointer(
       Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
       RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
@@ -507,7 +507,7 @@ void RuntimePointerChecking::groupChecks(
   // pointers to the same underlying object.
   if (!UseDependencies) {
     for (unsigned I = 0; I < Pointers.size(); ++I)
-      CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
+      CheckingGroups.emplace_back(I, *this);
     return;
   }
 
@@ -575,7 +575,7 @@ void RuntimePointerChecking::groupChecks(
           // We couldn't add this pointer to any existing set or the threshold
           // for the number of comparisons has been reached. Create a new group
           // to hold the current pointer.
-          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
+          Groups.emplace_back(Pointer, *this);
       }
     }
 
@@ -605,10 +605,7 @@ bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
     return false;
 
   // Only need to check pointers in the same alias set.
-  if (PointerI.AliasSetId != PointerJ.AliasSetId)
-    return false;
-
-  return true;
+  return PointerI.AliasSetId == PointerJ.AliasSetId;
 }
 
 void RuntimePointerChecking::printChecks(
@@ -658,7 +655,7 @@ class AccessAnalysis {
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
   typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
 
-  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
+  AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
                  MemoryDepChecker::DepCandidates &DA,
                  PredicatedScalarEvolution &PSE,
                  SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
@@ -669,7 +666,7 @@ class AccessAnalysis {
   }
 
   /// Register a load  and whether it is only read from.
-  void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
+  void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
@@ -678,7 +675,7 @@ class AccessAnalysis {
   }
 
   /// Register a store.
-  void addStore(MemoryLocation &Loc, Type *AccessTy) {
+  void addStore(const MemoryLocation &Loc, Type *AccessTy) {
     Value *Ptr = const_cast<Value *>(Loc.Ptr);
     AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
@@ -718,7 +715,7 @@ class AccessAnalysis {
   ///
   /// Note that this can later be cleared if we retry memcheck analysis without
   /// dependency checking (i.e. FoundNonConstantDistanceDependence).
-  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }
 
   /// We decided that no dependence analysis would be used.  Reset the state.
   void resetDepChecks(MemoryDepChecker &DepChecker) {
@@ -726,10 +723,10 @@ class AccessAnalysis {
     DepChecker.clearDependences();
   }
 
-  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
+  const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }
 
   const DenseMap<Value *, SmallVector<const Value *, 16>> &
-  getUnderlyingObjects() {
+  getUnderlyingObjects() const {
     return UnderlyingObjects;
   }
 
@@ -844,10 +841,8 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE,
     return true;
 
   int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
-  if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
-    return true;
-
-  return false;
+  return Stride == 1 ||
+         PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
 }
 
 static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
@@ -926,7 +921,7 @@ static void findForkedSCEVs(
   unsigned Opcode = I->getOpcode();
   switch (Opcode) {
   case Instruction::GetElementPtr: {
-    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    auto *GEP = cast<GetElementPtrInst>(I);
     Type *SourceTy = GEP->getSourceElementType();
     // We only handle base + single offset GEPs here for now.
     // Not dealing with preexisting gathers yet, so no vectors.
@@ -1081,7 +1076,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
       findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
-  for (auto &P : TranslatedPtrs) {
+  for (const auto &P : TranslatedPtrs) {
     const SCEV *PtrExpr = get<0>(P);
     if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
       return false;
@@ -1146,7 +1141,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
   // We assign a consecutive id to access from different alias sets.
   // Accesses between different groups doesn't need to be checked.
   unsigned ASId = 0;
-  for (auto &AS : AST) {
+  for (const auto &AS : AST) {
     int NumReadPtrChecks = 0;
     int NumWritePtrChecks = 0;
     bool CanDoAliasSetRT = true;
@@ -1196,7 +1191,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                   ShouldCheckWrap, false)) {
           LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                             << *Access.getPointer() << '\n');
-          Retries.push_back({Access, AccessTy});
+          Retries.emplace_back(Access, AccessTy);
           CanDoAliasSetRT = false;
         }
       }
@@ -1427,7 +1422,7 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
   // non-wrapping for the *specific* value of Ptr.
 
   // The arithmetic implied by an inbounds GEP can't overflow.
-  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP || !GEP->isInBounds())
     return false;
 
@@ -1504,7 +1499,7 @@ std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
     return std::nullopt;
   }
 
-  auto &DL = Lp->getHeader()->getDataLayout();
+  const auto &DL = Lp->getHeader()->getDataLayout();
   TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
   int64_t Size = AllocSize.getFixedValue();
   const APInt &APStepVal = C->getAPInt();
@@ -1583,8 +1578,10 @@ std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
   unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
 
   APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
-  Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
-  Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+  const Value *PtrA1 =
+      PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  const Value *PtrB1 =
+      PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
   int Val;
   if (PtrA1 == PtrB1) {
@@ -1917,10 +1914,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
     const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
     const DenseMap<Value *, SmallVector<const Value *, 16>>
         &UnderlyingObjects) {
-  auto &DL = InnermostLoop->getHeader()->getDataLayout();
+  const auto &DL = InnermostLoop->getHeader()->getDataLayout();
   auto &SE = *PSE.getSE();
-  auto [APtr, AIsWrite] = A;
-  auto [BPtr, BIsWrite] = B;
+  const auto &[APtr, AIsWrite] = A;
+  const auto &[BPtr, BIsWrite] = B;
 
   // Two reads are independent.
   if (!AIsWrite && !BIsWrite)
@@ -2252,7 +2249,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 }
 
 bool MemoryDepChecker::areDepsSafe(
-    DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
+    const DepCandidates &AccessSets, const MemAccessInfoList &CheckDeps,
     const DenseMap<Value *, SmallVector<const Value *, 16>>
         &UnderlyingObjects) {
 
@@ -2307,7 +2304,7 @@ bool MemoryDepChecker::areDepsSafe(
             // algorithm.
             if (RecordDependences) {
               if (Type != Dependence::NoDep)
-                Dependences.push_back(Dependence(A.second, B.second, Type));
+                Dependences.emplace_back(A.second, B.second, Type);
 
               if (Dependences.size() >= MaxDependences) {
                 RecordDependences = false;
@@ -2397,7 +2394,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
   return true;
 }
 
-bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
+bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
                                  DominatorTree *DT) {
   // Holds the Load and Store instructions.
@@ -2638,7 +2635,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
                                SymbolicStrides, UncomputablePtr, false);
   if (!CanDoRTIfNeeded) {
-    auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
+    const auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
     recordAnalysis("CantIdentifyArrayBounds", I)
         << "cannot identify array bounds";
     LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
@@ -2776,15 +2773,15 @@ bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
   assert(TheLoop->contains(BB) && "Unknown block used");
 
   // Blocks that do not dominate the latch need predication.
-  BasicBlock* Latch = TheLoop->getLoopLatch();
+  const BasicBlock *Latch = TheLoop->getLoopLatch();
   return !DT->dominates(BB, Latch);
 }
 
-OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
-                                                           Instruction *I) {
+OptimizationRemarkAnalysis &
+LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
   assert(!Report && "Multiple reports generated");
 
-  Value *CodeRegion = TheLoop->getHeader();
+  const Value *CodeRegion = TheLoop->getHeader();
   DebugLoc DL = TheLoop->getStartLoc();
 
   if (I) {
@@ -2841,7 +2838,7 @@ static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
 /// getGEPInductionOperand. However, if there is some other non-loop-invariant
 /// operand, it returns that instead.
 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP)
     return Ptr;
 
@@ -3078,7 +3075,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
 }
 
 const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
-  auto [It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
+  const auto &[It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
 
   if (Inserted)
     It->second =

>From 99bb9a719cec9513e72ad275c1c0302b76b6c408 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Thu, 25 Jul 2024 10:13:34 -0700
Subject: [PATCH 170/259] [compiler-rt][Profile][Darwin] Fix a test that
 expected an alignment … (#100469)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…greater than the maximum for a segment

This test was failing with:
```
--
Exit Code: 1

Command Output (stderr):
--
ld: warning: reducing alignment of section __DATA,__pcnts from 0x4000 to 0x1000 because it exceeds segment maximum alignment
ld: warning: reducing alignment of section __DATA,__pdata from 0x4000 to 0x1000 because it exceeds segment maximum alignment
__pdata not ordered after __pcnts.

--
```
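
As a side note, here is a minimal C++ sketch of the layout invariant the updated test relies on: both sections aligned to 0x1000, with `__pdata` placed exactly `cnts_len` bytes after `__pcnts`. This is not part of the patch; the `check_layout` helper is hypothetical:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Returns true when the two profile sections satisfy the expected layout:
// each section start is 0x1000-aligned and __pdata immediately follows the
// counters section.
bool check_layout(std::uintptr_t pcnts_start, std::uintptr_t pdata_start,
                  std::size_t cnts_len) {
  const std::uintptr_t section_align = 0x1000;
  if (pcnts_start % section_align != 0 || pdata_start % section_align != 0) {
    std::fprintf(stderr, "section not aligned to 0x%lx\n",
                 static_cast<unsigned long>(section_align));
    return false;
  }
  return pcnts_start + cnts_len == pdata_start;
}
```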
---
 .../darwin-proof-of-concept.c                 | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c b/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
index 85caca9a56b40..3ed7c1894b6d1 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/darwin-proof-of-concept.c
@@ -8,8 +8,8 @@
 
 // Align counters and data to the maximum expected page size (16K).
 // RUN: %clang -g -o %t %s \
-// RUN:   -Wl,-sectalign,__DATA,__pcnts,0x4000 \
-// RUN:   -Wl,-sectalign,__DATA,__pdata,0x4000
+// RUN:   -Wl,-sectalign,__DATA,__pcnts,0x1000 \
+// RUN:   -Wl,-sectalign,__DATA,__pdata,0x1000
 
 // Create a 'profile' using mmap() and validate it.
 // RUN: %run %t create %t.tmpfile
@@ -24,7 +24,7 @@
 
 __attribute__((section("__DATA,__pcnts"))) int counters[] = {0xbad};
 extern int cnts_start __asm("section$start$__DATA$__pcnts");
-const size_t cnts_len = 0x4000;
+const size_t cnts_len = 0x1000;
 
 __attribute__((section("__DATA,__pdata"))) int data[] = {1, 2, 3};
 extern int data_start __asm("section$start$__DATA$__pdata");
@@ -44,8 +44,8 @@ int create_tmpfile(char *path) {
     return EXIT_FAILURE;
   }
 
-  // Write the data first (at offset 0x4000, after the counters).
-  if (data_len != pwrite(fd, &data, data_len, 0x4000)) {
+  // Write the data first (at offset 0x1000, after the counters).
+  if (data_len != pwrite(fd, &data, data_len, cnts_len)) {
     perror("write");
     return EXIT_FAILURE;
   }
@@ -55,8 +55,8 @@ int create_tmpfile(char *path) {
   // Requirements (on Darwin):
   // - &cnts_start must be page-aligned.
   // - The length and offset-into-fd must be page-aligned.
-  int *counter_map = (int *)mmap(&cnts_start, 0x4000, PROT_READ | PROT_WRITE,
-      MAP_FIXED | MAP_SHARED, fd, 0);
+  int *counter_map = (int *)mmap(&cnts_start, cnts_len, PROT_READ | PROT_WRITE,
+                                 MAP_FIXED | MAP_SHARED, fd, 0);
   if (counter_map != &cnts_start) {
     perror("mmap");
     return EXIT_FAILURE;
@@ -97,7 +97,7 @@ int validate_tmpfile(char *path) {
   }
 
   // Verify that the rest of the counters (after counter 9) are 0.
-  const int num_cnts = 0x4000 / sizeof(int);
+  const int num_cnts = cnts_len / sizeof(int);
   for (int i = 10; i < num_cnts; ++i) {
     if (buf[i] != 0) {
       fprintf(stderr,
@@ -131,11 +131,12 @@ int main(int argc, char **argv) {
     fprintf(stderr, "__pcnts is not page-aligned: 0x%lx.\n", cnts_start_int);
     return EXIT_FAILURE;
   }
-  if (data_start_int % pagesz != 0) {
-    fprintf(stderr, "__pdata is not page-aligned: 0x%lx.\n", data_start_int);
+  if (data_start_int % 0x1000 != 0) {
+    fprintf(stderr, "__pdata is not correctly aligned: 0x%lx.\n",
+            data_start_int);
     return EXIT_FAILURE;
   }
-  if (cnts_start_int + 0x4000 != data_start_int) {
+  if (cnts_start_int + 0x1000 != data_start_int) {
     fprintf(stderr, "__pdata not ordered after __pcnts.\n");
     return EXIT_FAILURE;
   }

>From c194bc77a21d68dd321588c9d726ef2d6c556a30 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 25 Jul 2024 18:15:14 +0100
Subject: [PATCH 171/259] [mlir][ArmSME] Add rewrite to handle unsupported SVE
 transposes via SME/ZA (#98620)

This adds a workaround rewrite that allows stores of unsupported SVE
transposes such as:

```mlir
%tr = vector.transpose %vec, [1, 0]
  : vector<2x[4]xf32> to vector<[4]x2xf32>
vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]}
  : vector<[4]x2xf32>,  memref<?x?xf32>
```

To instead use SME tiles, which can be lowered (when SME is available):

```mlir
// Insert vector<2x[4]xf32> into an SME tile:
%0 = arm_sme.get_tile : vector<[4]x[4]xf32>
%1 = vector.extract %vec[0] : vector<[4]xf32> from vector<2x[4]xf32>
%2 = vector.insert %1, %0 [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
%3 = vector.extract %vec[1] : vector<[4]xf32> from vector<2x[4]xf32>
%4 = vector.insert %3, %2 [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
// Store the tile with a transpose + mask:
%c4_vscale = arith.muli %vscale, %c4 : index
%mask = vector.create_mask %c4_vscale, %c2 : vector<[4]x[4]xi1>
vector.transfer_write %4, %arg1[%arg2, %arg3], %mask
   {permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
   : vector<[4]x[4]xf32>, memref<?x?xf32>
```
---
 .../mlir/Dialect/ArmSME/Transforms/Passes.td  |   3 +-
 .../mlir/Dialect/Vector/Utils/VectorUtils.h   |  19 ++
 .../Dialect/ArmSME/Transforms/CMakeLists.txt  |   1 +
 .../ArmSME/Transforms/VectorLegalization.cpp  | 165 ++++++++++++++++--
 .../Dialect/ArmSME/vector-legalization.mlir   | 102 +++++++++++
 5 files changed, 278 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
index dfd64f995546a..921234daad1f1 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -202,7 +202,8 @@ def VectorLegalization
     "func::FuncDialect",
     "arm_sme::ArmSMEDialect",
     "vector::VectorDialect",
-    "arith::ArithDialect"
+    "arith::ArithDialect",
+    "index::IndexDialect"
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 9c83acc76e77a..40e04b76593a0 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 #define MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -101,6 +102,24 @@ bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
 std::optional<StaticTileOffsetRange>
 createUnrollIterator(VectorType vType, int64_t targetRank = 1);
 
+/// Returns a functor (int64_t -> Value) which returns a constant vscale
+/// multiple.
+///
+/// Example:
+/// ```c++
+/// auto createVscaleMultiple = makeVscaleConstantBuilder(rewriter, loc);
+/// auto c4Vscale = createVscaleMultiple(4); // 4 * vector.vscale
+/// ```
+inline auto makeVscaleConstantBuilder(PatternRewriter &rewriter, Location loc) {
+  Value vscale = nullptr;
+  return [loc, vscale, &rewriter](int64_t multiplier) mutable {
+    if (!vscale)
+      vscale = rewriter.create<vector::VectorScaleOp>(loc);
+    return rewriter.create<arith::MulIOp>(
+        loc, vscale, rewriter.create<arith::ConstantIndexOp>(loc, multiplier));
+  };
+}
+
 /// A wrapper for getMixedSizes for vector.transfer_read and
 /// vector.transfer_write Ops (for source and destination, respectively).
 ///
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
index 600f2ecdb51bc..8f9b5080e82db 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSME/Transforms/CMakeLists.txt
@@ -16,6 +16,7 @@ add_mlir_dialect_library(MLIRArmSMETransforms
   MLIRFuncDialect
   MLIRLLVMCommonConversion
   MLIRVectorDialect
+  MLIRIndexDialect
   MLIRSCFDialect
   MLIRSCFTransforms
   MLIRFuncTransforms
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 96dad6518fec8..004428bd343e2 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -18,10 +18,13 @@
 #include "mlir/Dialect/ArmSME/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/OneToNFuncConversions.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/Transforms/OneToNTypeConversion.h"
 
 #define DEBUG_TYPE "arm-sme-vector-legalization"
@@ -140,11 +143,11 @@ Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
 auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
                          VectorType smeTileType,
                          bool transposeIndices = false) {
-  assert(isMultipleOfSMETileVectorType(type) &&
-         "`type` not multiple of SME tiles");
   return llvm::map_range(
-      StaticTileOffsetRange(type.getShape(), {smeTileType.getDimSize(0),
-                                              smeTileType.getDimSize(1)}),
+      StaticTileOffsetRange(
+          type.getShape(),
+          {std::min(type.getDimSize(0), smeTileType.getDimSize(0)),
+           std::min(type.getDimSize(1), smeTileType.getDimSize(1))}),
       [=](auto indices) {
         int row = int(indices[0]);
         int col = int(indices[1]);
@@ -440,12 +443,8 @@ struct LegalizeMultiTileTransferWriteAsStoreLoop
                                          kMatchFailureUnsupportedMaskOp);
 
     auto loc = writeOp.getLoc();
-    auto vscale = rewriter.create<vector::VectorScaleOp>(loc);
-    auto createVscaleMultiple = [&](int64_t multiplier) {
-      return rewriter.create<arith::MulIOp>(
-          loc, vscale,
-          rewriter.create<arith::ConstantIndexOp>(loc, multiplier));
-    };
+    auto createVscaleMultiple =
+        vector::makeVscaleConstantBuilder(rewriter, loc);
 
     // Get SME tile and slice types.
     auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
@@ -775,6 +774,149 @@ struct ConvertIllegalShapeCastOpsToTransposes
   }
 };
 
+/// Rewrites an illegal/unsupported SVE transfer_write(transpose) to instead use
+/// the ZA state. This is a workaround rewrite to support these transposes when
+/// ZA is available.
+///
+/// Example:
+///
+///  BEFORE:
+///  ```mlir
+///  %transpose = vector.transpose %vec, [1, 0]
+///     : vector<2x[4]xf32> to vector<[4]x2xf32>
+///  vector.transfer_write %transpose, %dest[%y, %x]
+///     : vector<[4]x2xf32>,  memref<?x?xf32>
+///  ```
+///
+///  AFTER:
+///  ```mlir
+///   %0 = arm_sme.get_tile : vector<[4]x[4]xf32>
+///   %1 = vector.extract %vec[0] : vector<[4]xf32> from vector<2x[4]xf32>
+///   %2 = vector.insert %1, %0 [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
+///   %3 = vector.extract %vec[1] : vector<[4]xf32> from vector<2x[4]xf32>
+///   %4 = vector.insert %3, %2 [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+///   %c4_vscale = arith.muli %vscale, %c4 : index
+///   %mask = vector.create_mask %c4_vscale, %c2 : vector<[4]x[4]xi1>
+///   vector.transfer_write %4, %dest[%y, %x], %mask
+///      {permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
+///      : vector<[4]x[4]xf32>, memref<?x?xf32>
+///  ```
+///
+/// Values larger than a single tile are supported via decomposition.
+struct LowerIllegalTransposeStoreViaZA
+    : public OpRewritePattern<vector::TransferWriteOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
+                                PatternRewriter &rewriter) const override {
+    if (!isSupportedMaskOp(writeOp.getMask()))
+      return rewriter.notifyMatchFailure(writeOp,
+                                         kMatchFailureUnsupportedMaskOp);
+
+    auto permutationMap = writeOp.getPermutationMap();
+    if (!permutationMap.isIdentity())
+      return rewriter.notifyMatchFailure(writeOp,
+                                         kMatchFailureNonPermutationMap);
+
+    auto transposeOp = writeOp.getVector().getDefiningOp<vector::TransposeOp>();
+    if (!transposeOp)
+      return failure();
+
+    auto sourceType = transposeOp.getSourceVectorType();
+    auto resultType = transposeOp.getResultVectorType();
+
+    if (resultType.getRank() != 2)
+      return rewriter.notifyMatchFailure(transposeOp, "TransposeOp not rank 2");
+
+    if (!isLegalVectorType(sourceType) || isLegalVectorType(resultType))
+      return rewriter.notifyMatchFailure(
+          transposeOp, "not illegal/unsupported SVE transpose");
+
+    auto smeTileType = getSMETileTypeForElement(resultType.getElementType());
+    VectorType smeSliceType = VectorType::Builder(smeTileType).dropDim(0);
+
+    if (sourceType.getDimSize(0) <= 1 ||
+        sourceType.getDimSize(1) % smeSliceType.getDimSize(0) != 0)
+      return rewriter.notifyMatchFailure(writeOp, "unsupported source shape");
+
+    auto loc = writeOp.getLoc();
+    auto createVscaleMultiple =
+        vector::makeVscaleConstantBuilder(rewriter, loc);
+
+    auto transposeMap = AffineMapAttr::get(
+        AffineMap::getPermutationMap(ArrayRef<int64_t>{1, 0}, getContext()));
+
+    // Note: We need to use `get_tile` as there's no vector-level `undef`.
+    Value undefTile = rewriter.create<arm_sme::GetTileOp>(loc, smeTileType);
+    Value destTensorOrMemref = writeOp.getSource();
+    auto numSlicesPerTile =
+        std::min(sourceType.getDimSize(0), smeTileType.getDimSize(0));
+    auto numSlices =
+        rewriter.create<arith::ConstantIndexOp>(loc, numSlicesPerTile);
+    for (auto [index, smeTile] : llvm::enumerate(
+             decomposeToSMETiles(rewriter, sourceType, smeTileType))) {
+      // 1. _Deliberately_ drop a scalable dimension and insert a fixed number
+      // of slices from the source type into the SME tile. Without checking
+      // vscale (and emitting multiple implementations) we can't make use of the
+      // rows of the tile after 1*vscale rows.
+      Value tile = undefTile;
+      for (int d = 0; d < numSlicesPerTile; ++d) {
+        Value vector = rewriter.create<vector::ExtractOp>(
+            loc, transposeOp.getVector(),
+            rewriter.getIndexAttr(d + smeTile.row));
+        if (vector.getType() != smeSliceType) {
+          vector = rewriter.create<vector::ScalableExtractOp>(
+              loc, smeSliceType, vector, smeTile.col);
+        }
+        tile = rewriter.create<vector::InsertOp>(loc, vector, tile, d);
+      }
+
+      // 2. Transpose the tile position.
+      auto transposedRow = createVscaleMultiple(smeTile.col);
+      auto transposedCol =
+          rewriter.create<arith::ConstantIndexOp>(loc, smeTile.row);
+
+      // 3. Compute mask for tile store.
+      Value maskRows;
+      Value maskCols;
+      if (auto mask = writeOp.getMask()) {
+        auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
+        maskRows = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(0),
+                                                  transposedRow);
+        maskCols = rewriter.create<arith::SubIOp>(loc, createMask.getOperand(1),
+                                                  transposedCol);
+        maskCols = rewriter.create<index::MinSOp>(loc, maskCols, numSlices);
+      } else {
+        maskRows = createVscaleMultiple(smeTileType.getDimSize(0));
+        maskCols = numSlices;
+      }
+      auto subMask = rewriter.create<vector::CreateMaskOp>(
+          loc, smeTileType.clone(rewriter.getI1Type()),
+          ValueRange{maskRows, maskCols});
+
+      // 4. Emit a transposed tile write.
+      auto writeIndices = writeOp.getIndices();
+      Value destRow =
+          rewriter.create<arith::AddIOp>(loc, transposedRow, writeIndices[0]);
+      Value destCol =
+          rewriter.create<arith::AddIOp>(loc, transposedCol, writeIndices[1]);
+      auto smeWrite = rewriter.create<vector::TransferWriteOp>(
+          loc, tile, destTensorOrMemref, ValueRange{destRow, destCol},
+          transposeMap, subMask, writeOp.getInBounds());
+
+      if (writeOp.hasPureTensorSemantics())
+        destTensorOrMemref = smeWrite.getResult();
+    }
+
+    if (writeOp.hasPureTensorSemantics())
+      rewriter.replaceOp(writeOp, destTensorOrMemref);
+    else
+      rewriter.eraseOp(writeOp);
+
+    return success();
+  }
+};
+
 struct VectorLegalizationPass
     : public arm_sme::impl::VectorLegalizationBase<VectorLegalizationPass> {
   void runOnOperation() override {
@@ -796,7 +938,8 @@ struct VectorLegalizationPass
 
     patterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
                  LiftIllegalVectorTransposeToMemory,
-                 ConvertIllegalShapeCastOpsToTransposes>(context);
+                 ConvertIllegalShapeCastOpsToTransposes,
+                 LowerIllegalTransposeStoreViaZA>(context);
     // Note: These two patterns are added with a high benefit to ensure:
     //  - Masked outer products are handled before unmasked ones
     //  - Multi-tile writes are lowered as a store loop (if possible)
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
index 71d80bc16ea12..458906a187982 100644
--- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -544,3 +544,105 @@ func.func @multi_tile_splat() -> vector<[8]x[8]xi32>
   %0 = arith.constant dense<42> : vector<[8]x[8]xi32>
   return %0 : vector<[8]x[8]xi32>
 }
+
+// -----
+
+// CHECK: #[[$TRANSPOSE_MAP_0:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+
+// CHECK-LABEL: @transpose_store_scalable_via_za(
+// CHECK-SAME:                                   %[[VEC:.*]]: vector<2x[4]xf32>
+// CHECK-SAME:                                   %[[DEST:.*]]: memref<?x?xf32>,
+// CHECK-SAME:                                   %[[I:.*]]: index,
+// CHECK-SAME:                                   %[[J:.*]]: index)
+func.func @transpose_store_scalable_via_za(%vec: vector<2x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+  // CHECK-NEXT: %[[INIT:.*]] = arm_sme.get_tile : vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[V0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+  // CHECK-NEXT: %[[R0:.*]] = vector.insert %[[V0]], %[[INIT]] [0] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[V1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+  // CHECK-NEXT: %[[RES:.*]] = vector.insert %[[V1]], %[[R0]] [1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK-NEXT: %[[VSCALE:.*]] = vector.vscale
+  // CHECK-NEXT: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+  // CHECK-NEXT: %[[MASK:.*]] = vector.create_mask %[[C4_VSCALE]], %[[C2]] : vector<[4]x[4]xi1>
+  // CHECK-NEXT: vector.transfer_write %[[RES]], %[[DEST]][%[[I]], %[[J]]], %[[MASK]] {in_bounds = [true, true], permutation_map = #[[$TRANSPOSE_MAP_0]]} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_masked(
+// CHECK-SAME:                                          %[[A:[a-z0-9]+]]: index,
+// CHECK-SAME:                                          %[[B:[a-z0-9]+]]: index)
+func.func @transpose_store_scalable_via_za_masked(%vec: vector<2x[4]xf32>, %dest: memref<?x?xf32>, %a: index, %b: index) {
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MIN:.*]] = index.mins %[[B]], %[[C2]]
+  // CHECK: %[[MASK:.*]] = vector.create_mask %[[A]], %[[MIN]] : vector<[4]x[4]xi1>
+  // CHECK: vector.transfer_write {{.*}} %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %c0 = arith.constant 0 : index
+  %mask = vector.create_mask %a, %b : vector<[4]x2xi1>
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  vector.transfer_write %tr, %dest[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_multi_tile(
+// CHECK-SAME:                                              %[[VEC:.*]]: vector<8x[4]xf32>
+// CHECK-SAME:                                              %[[DEST:.*]]: memref<?x?xf32>,
+// CHECK-SAME:                                              %[[I:.*]]: index,
+// CHECK-SAME:                                              %[[J:.*]]: index)
+func.func @transpose_store_scalable_via_za_multi_tile(%vec: vector<8x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // CHECK: %[[C4:.*]] = arith.constant 4 : index
+
+  // <skip 3x other extract+insert chain>
+  // CHECK: %[[V3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<8x[4]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[V3]], %{{.*}} [3] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[VSCALE:.*]] = vector.vscale
+  // CHECK: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+  // CHECK: %[[MASK:.*]] = vector.create_mask %c4_vscale, %c4 : vector<[4]x[4]xi1>
+  // CHECK: vector.transfer_write %[[TILE_0]], %[[DEST]][%[[I]], %[[J]]], %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+
+  // <skip 3x other extract+insert chain>
+  // CHECK: %[[V7:.*]] = vector.extract %arg0[7] : vector<[4]xf32> from vector<8x[4]xf32>
+  // CHECK: %[[TILE_1:.*]] = vector.insert %[[V7]], %{{.*}} [3] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[J_OFFSET:.*]] = arith.addi %[[J]], %[[C4]] : index
+  // CHECK: vector.transfer_write %[[TILE_1]], %[[DEST]][%[[I]], %[[J_OFFSET]]], %[[MASK]] {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
+  %tr = vector.transpose %vec, [1, 0] : vector<8x[4]xf32> to vector<[4]x8xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x8xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_store_scalable_via_za_multi_tile_wide
+func.func @transpose_store_scalable_via_za_multi_tile_wide(%vec: vector<2x[8]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  // <check extracts from lower 4 x vscale of %vec>
+  // CHECK: vector.scalable.extract
+  // CHECK: %[[ROW_2_LOWER:.*]] = vector.scalable.extract %{{.*}}[0] : vector<[4]xf32> from vector<[8]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[ROW_2_LOWER]], %{{.*}}[1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: vector.transfer_write %[[TILE_0]], %{{.*}}[%[[I:.[a-z0-9]+]], %[[J:[a-z0-9]+]]]
+
+  // <check extracts from upper 4 x vscale of %vec>
+  // CHECK: vector.scalable.extract
+  // CHECK: %[[ROW_2_UPPER:.*]] = vector.scalable.extract %{{.*}}[4] : vector<[4]xf32> from vector<[8]xf32>
+  // CHECK: %[[TILE_0:.*]] = vector.insert %[[ROW_2_UPPER]], %{{.*}}[1] : vector<[4]xf32> into vector<[4]x[4]xf32>
+  // CHECK: %[[I_OFFSET:.*]] = arith.addi %c4_vscale, %[[I]] : index
+  // CHECK: vector.transfer_write %[[TILE_0]], %{{.*}}[%[[I_OFFSET]], %[[J]]]
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[8]xf32> to vector<[8]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[8]x2xf32>,  memref<?x?xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_transpose_store_scalable_via_za__bad_source_shape
+// CHECK-NOT: arm_sme.get_tile
+func.func @negative_transpose_store_scalable_via_za__bad_source_shape(%vec: vector<2x[7]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
+  %tr = vector.transpose %vec, [1, 0] : vector<2x[7]xf32> to vector<[7]x2xf32>
+  vector.transfer_write %tr, %dest[%i, %j] {in_bounds = [true, true]} : vector<[7]x2xf32>,  memref<?x?xf32>
+  return
+}

>From a55df237375e98cfc2520d5eb1a23b302ef02ba0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Thu, 25 Jul 2024 12:16:48 -0500
Subject: [PATCH 172/259] [libc++] Add missing xlocale.h include on Apple and
 FreeBSD (#99689)

The `<locale>` header uses `strtoll_l` and friends, which are defined in
`<xlocale.h>` on these platforms. While this works via transitive
includes when modules are disabled, it no longer works once these
platforms' headers are properly modularized.
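
For illustration, a minimal standalone sketch of the same dependency in
user code (the parse_decimal helper is hypothetical, not taken from
libc++):

  #include <stdlib.h>   // strtoll and friends
  #include <xlocale.h>  // strtoll_l and locale_t on Apple and FreeBSD

  long long parse_decimal(const char *s, locale_t loc) {
    // Without <xlocale.h> this fails to compile on these platforms once
    // transitive includes stop providing the declaration.
    return strtoll_l(s, /*str_end=*/nullptr, /*base=*/10, loc);
  }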
---
 libcxx/include/locale | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libcxx/include/locale b/libcxx/include/locale
index dbec23a2c936d..573910a85bef5 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -232,6 +232,10 @@ template <class charT> class messages_byname;
 #    include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h>
 #  endif
 
+#  if defined(__APPLE__) || defined(__FreeBSD__)
+#    include <xlocale.h>
+#  endif
+
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #    pragma GCC system_header
 #  endif

>From db8c84fc7a75dd60bcfff7160b51e1a55e7e0f73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Thu, 25 Jul 2024 19:21:28 +0200
Subject: [PATCH 173/259] [GlobalIsel] Push cast through select. (#100539)

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  11 ++
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |  16 +++
 .../include/llvm/Target/GlobalISel/Combine.td |  17 ++-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  10 ++
 .../GlobalISel/CombinerHelperCasts.cpp        |  48 +++++++
 .../AArch64/GlobalISel/combine-cast.mir       | 131 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |  38 +++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   4 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |   8 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |   2 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   9 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |   2 +-
 12 files changed, 263 insertions(+), 33 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 47365c3be3b93..05d7e882f5135 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -129,6 +129,12 @@ class CombinerHelper {
 
   const TargetLowering &getTargetLowering() const;
 
+  const MachineFunction &getMachineFunction() const;
+
+  const DataLayout &getDataLayout() const;
+
+  LLVMContext &getContext() const;
+
   /// \returns true if the combiner is running pre-legalization.
   bool isPreLegalize() const;
 
@@ -884,6 +890,9 @@ class CombinerHelper {
   bool matchTruncateOfExt(const MachineInstr &Root, const MachineInstr &ExtMI,
                           BuildFnTy &MatchInfo);
 
+  bool matchCastOfSelect(const MachineInstr &Cast, const MachineInstr &SelectMI,
+                         BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -996,6 +1005,8 @@ class CombinerHelper {
 
   // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
   bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
+
+  bool isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const;
 };
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 8b7e8c0fbf1f5..ef1171d9f1f64 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -934,6 +934,22 @@ class GExtOp : public GCastOp {
   };
 };
 
+/// Represents an integer-like extending or truncating operation.
+class GExtOrTruncOp : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_SEXT:
+    case TargetOpcode::G_ZEXT:
+    case TargetOpcode::G_ANYEXT:
+    case TargetOpcode::G_TRUNC:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 2362e77b54be2..2246e20ecc1dc 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1771,10 +1771,25 @@ def truncate_of_zext : truncate_of_opcode<G_ZEXT>;
 def truncate_of_sext : truncate_of_opcode<G_SEXT>;
 def truncate_of_anyext : truncate_of_opcode<G_ANYEXT>;
 
+// Push cast through select.
+class select_of_opcode<Instruction castOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_SELECT $select, $cond, $true, $false):$Select,
+         (castOpcode $root, $select):$Cast,
+         [{ return Helper.matchCastOfSelect(*${Cast}, *${Select}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>;
+
+def select_of_zext : select_of_opcode<G_ZEXT>;
+def select_of_anyext : select_of_opcode<G_ANYEXT>;
+def select_of_truncate : select_of_opcode<G_TRUNC>;
+
 def cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
-  truncate_of_anyext
+  truncate_of_anyext,
+  select_of_zext,
+  select_of_anyext,
+  select_of_truncate
 ]>;
 
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 8c05931812af5..d930ab2984629 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -68,6 +68,16 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
   return *Builder.getMF().getSubtarget().getTargetLowering();
 }
 
+const MachineFunction &CombinerHelper::getMachineFunction() const {
+  return Builder.getMF();
+}
+
+const DataLayout &CombinerHelper::getDataLayout() const {
+  return getMachineFunction().getDataLayout();
+}
+
+LLVMContext &CombinerHelper::getContext() const { return Builder.getContext(); }
+
 /// \returns The little endian in-memory byte position of byte \p I in a
 /// \p ByteWidth bytes wide type.
 ///
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index d36685bf28313..59295f7a65835 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -161,3 +161,51 @@ bool CombinerHelper::matchTruncateOfExt(const MachineInstr &Root,
 
   return false;
 }
+
+bool CombinerHelper::isCastFree(unsigned Opcode, LLT ToTy, LLT FromTy) const {
+  const TargetLowering &TLI = getTargetLowering();
+  const DataLayout &DL = getDataLayout();
+  LLVMContext &Ctx = getContext();
+
+  switch (Opcode) {
+  case TargetOpcode::G_ANYEXT:
+  case TargetOpcode::G_ZEXT:
+    return TLI.isZExtFree(FromTy, ToTy, DL, Ctx);
+  case TargetOpcode::G_TRUNC:
+    return TLI.isTruncateFree(FromTy, ToTy, DL, Ctx);
+  default:
+    return false;
+  }
+}
+
+bool CombinerHelper::matchCastOfSelect(const MachineInstr &CastMI,
+                                       const MachineInstr &SelectMI,
+                                       BuildFnTy &MatchInfo) {
+  const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+  const GSelect *Select = cast<GSelect>(&SelectMI);
+
+  if (!MRI.hasOneNonDBGUse(Select->getReg(0)))
+    return false;
+
+  Register Dst = Cast->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  Register TrueReg = Select->getTrueReg();
+  Register FalseReg = Select->getFalseReg();
+  LLT SrcTy = MRI.getType(TrueReg);
+  Register Cond = Select->getCondReg();
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SELECT, {DstTy, CondTy}}))
+    return false;
+
+  if (!isCastFree(Cast->getOpcode(), DstTy, SrcTy))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto True = B.buildInstr(Cast->getOpcode(), {DstTy}, {TrueReg});
+    auto False = B.buildInstr(Cast->getOpcode(), {DstTy}, {FalseReg});
+    B.buildSelect(Dst, Cond, True, False);
+  };
+
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
new file mode 100644
index 0000000000000..0f436127ea2eb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRE
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs  %s | FileCheck %s --check-prefixes=CHECK,CHECK-POST
+
+---
+name:            test_combine_trunc_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: %rhs:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %rhs(s64)
+    ; CHECK-PRE-NEXT: %small:_(s32) = G_SELECT %cond(s32), [[TRUNC]], [[TRUNC1]]
+    ; CHECK-PRE-NEXT: $w0 = COPY %small(s32)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %rhs:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %small:_(s32) = G_TRUNC %res(s64)
+    ; CHECK-POST-NEXT: $w0 = COPY %small(s32)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = COPY $x0
+    %res:_(s64) = G_SELECT %cond(s32), %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name:            test_combine_zext_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_zext_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %lhs(s32)
+    ; CHECK-PRE-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT %rhs(s32)
+    ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ZEXT]], [[ZEXT1]]
+    ; CHECK-PRE-NEXT: $x0 = COPY %big(s64)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_zext_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %big:_(s64) = G_ZEXT %res(s32)
+    ; CHECK-POST-NEXT: $x0 = COPY %big(s64)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ZEXT %res(s32)
+    $x0 = COPY %big(s64)
+...
+---
+name:            test_combine_anyzext_select
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_anyzext_select
+    ; CHECK-PRE: %cond:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %lhs(s32)
+    ; CHECK-PRE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT %rhs(s32)
+    ; CHECK-PRE-NEXT: %big:_(s64) = G_SELECT %cond(s32), [[ANYEXT]], [[ANYEXT1]]
+    ; CHECK-PRE-NEXT: $x0 = COPY %big(s64)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_anyzext_select
+    ; CHECK-POST: %cond:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-POST-NEXT: %big:_(s64) = G_ANYEXT %res(s32)
+    ; CHECK-POST-NEXT: $x0 = COPY %big(s64)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ANYEXT %res(s32)
+    $x0 = COPY %big(s64)
+...
+---
+name:            test_combine_anyzext_select_multi_use
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_anyzext_select_multi_use
+    ; CHECK: %cond:_(s32) = COPY $w0
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %rhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    ; CHECK-NEXT: %big:_(s64) = G_ANYEXT %res(s32)
+    ; CHECK-NEXT: $x0 = COPY %big(s64)
+    ; CHECK-NEXT: $w0 = COPY %res(s32)
+    %cond:_(s32) = COPY $w0
+    %lhs:_(s32) = COPY $w0
+    %rhs:_(s32) = COPY $w0
+    %res:_(s32) = G_SELECT %cond(s32), %lhs, %rhs
+    %big:_(s64) = G_ANYEXT %res(s32)
+    $x0 = COPY %big(s64)
+    $w0 = COPY %res(s32)
+...
+---
+name:            test_combine_trunc_select_vector_out_of_budget
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_select_vector_out_of_budget
+    ; CHECK: %cond:_(<2 x s32>) = COPY $x0
+    ; CHECK-NEXT: %arg1:_(s64) = COPY $x0
+    ; CHECK-NEXT: %arg2:_(s64) = COPY $x0
+    ; CHECK-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    ; CHECK-NEXT: %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64)
+    ; CHECK-NEXT: %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
+    ; CHECK-NEXT: %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %small(<2 x s32>)
+    %cond:_(<2 x s32>) = COPY $x0
+    %arg1:_(s64) = COPY $x0
+    %arg2:_(s64) = COPY $x0
+    %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    %bv2:_(<2 x s64>) = G_BUILD_VECTOR %arg2(s64), %arg1(s64)
+    %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
+    %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
+    $x0 = COPY %small(<2 x s32>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ec832ed0f7f3a..63f5464371cc6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1845,39 +1845,37 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT:    s_ashr_i32 s8, s5, 31
+; GCN-NEXT:    s_ashr_i32 s7, s5, 31
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_mov_b32 s9, s8
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[8:9]
+; GCN-NEXT:    s_cselect_b32 s2, s6, s7
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i65:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GFX10PLUS-NEXT:    s_sub_i32 s12, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s8, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s2, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s14, 1, 0
-; GFX10PLUS-NEXT:    s_ashr_i64 s[6:7], s[4:5], s3
-; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10PLUS-NEXT:    s_ashr_i32 s10, s5, 31
-; GFX10PLUS-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10PLUS-NEXT:    s_ashr_i64 s[4:5], s[4:5], s12
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s11, s10
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[10:11]
+; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
+; GFX10PLUS-NEXT:    s_ashr_i64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT:    s_ashr_i32 s3, s5, 31
+; GFX10PLUS-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 980ba3da4bac7..5dd4fa0809131 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1766,7 +1766,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GCN-NEXT:    s_cselect_b32 s2, s6, 0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i65:
@@ -1788,7 +1788,7 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc44587..4cf1c92539c36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1733,9 +1733,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s10
 ; GCN-NEXT:    s_cmp_lg_u32 s11, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[8:9]
+; GCN-NEXT:    s_cselect_b32 s3, s6, s8
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT:    s_cselect_b32 s2, s2, s3
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i65:
@@ -1753,9 +1753,9 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[0:1], s10
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX10PLUS-NEXT:    s_cselect_b32 s3, s4, s6
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index ba0a1e75e29b7..a0b549711f339 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1593,7 +1593,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index a55c8cdc9b6e8..2168e7fe1dd28 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1706,11 +1706,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[0:1]
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 57fe6cd4e1e45..14e6c4bcf6d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, s0
 ; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

>From edf9756e2e658ac8faa8635e15acb428d5b103f6 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis at chromium.org>
Date: Thu, 25 Jul 2024 13:38:50 -0400
Subject: [PATCH 174/259] [gn] port abc2eae6829 (bolt standalone)

---
 .../gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn  |  4 ++++
 llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn | 17 +++++++++++++++++
 llvm/utils/gn/secondary/bolt/test/BUILD.gn      |  2 ++
 3 files changed, 23 insertions(+)

diff --git a/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
index 701df4d0c8597..750f73630b7d5 100644
--- a/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/RuntimeLibs/BUILD.gn
@@ -18,4 +18,8 @@ static_library("RuntimeLibs") {
     "InstrumentationRuntimeLibrary.cpp",
     "RuntimeLibrary.cpp",
   ]
+
+  # FIXME: Hopefully change this, see
+  # https://github.com/llvm/llvm-project/pull/97130/files#r1691863361
+  defines = [ "CMAKE_INSTALL_FULL_LIBDIR=\"\"" ]
 }
diff --git a/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
index 6a70a90815c8c..8517ff43be47b 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Utils/BUILD.gn
@@ -1,7 +1,24 @@
+import("//llvm/utils/gn/build/write_vcsrevision.gni")
+
+# Configure the VCSVersion.inc file
+config("write_vcsrevision_config") {
+  # To pick up the generated inc file.
+  include_dirs = [ target_gen_dir ]
+  visibility = [ ":write_vcsversion" ]
+}
+
+write_vcsrevision("write_vcsversion") {
+  visibility = [ ":Utils" ]
+  header = "$target_gen_dir/VCSVersion.inc"
+  names = [ "LLDB" ]
+  public_configs = [ ":write_vcsrevision_config" ]
+}
+
 static_library("Utils") {
   output_name = "LLVMBOLTUtils"
   configs += [ "//llvm/utils/gn/build:bolt_code" ]
   deps = [
+    ":write_vcsversion",
     "//llvm/lib/Support",
     "//llvm/utils/gn/build/libs/pthread",
   ]
diff --git a/llvm/utils/gn/secondary/bolt/test/BUILD.gn b/llvm/utils/gn/secondary/bolt/test/BUILD.gn
index d8ef560624897..17072a8d47737 100644
--- a/llvm/utils/gn/secondary/bolt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/test/BUILD.gn
@@ -45,6 +45,8 @@ write_lit_config("lit_site_cfg") {
     "BOLT_ENABLE_RUNTIME=0",  # FIXME: enable runtime
     "BOLT_TARGETS_TO_BUILD=$bolt_targets_to_build_string",
     "GNU_LD_EXECUTABLE=",  # FIXME: set sometimes?
+    "LIBBOLT_RT_HUGIFY=",
+    "LIBBOLT_RT_INSTR=",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_USE_SANITIZER=",
     "Python3_EXECUTABLE=$python_path",

>From 08b3556c2ff8948932bcbbc77781fe41388e4e63 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis at chromium.org>
Date: Thu, 25 Jul 2024 13:46:52 -0400
Subject: [PATCH 175/259] [gn] port enough of 2914a4b88837 (lldb Interfaces)

gn's lldb does not have the Python plugin, so most of the changes
from 2914a4b88837 aren't necessary here.
---
 llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn | 1 +
 .../lldb/source/Interpreter/Interfaces/BUILD.gn          | 9 +++++++++
 2 files changed, 10 insertions(+)
 create mode 100644 llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn

diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
index 7825396b402d5..c6c6ef994c28d 100644
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Interpreter") {
   deps = [
     ":InterpreterProperties",
     ":InterpreterPropertiesEnum",
+    "Interfaces",
     "//lldb/source/Commands",
     "//lldb/source/Core",
     "//lldb/source/DataFormatters",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
new file mode 100644
index 0000000000000..2e70c54ea9564
--- /dev/null
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
@@ -0,0 +1,9 @@
+static_library("Interfaces") {
+  output_name = "lldbInterpreterInterfaces"
+  configs += [ "//llvm/utils/gn/build:lldb_code" ]
+  deps = [
+    "//lldb/source/Utility",
+    "//llvm/lib/Support",
+  ]
+  sources = [ "ScriptedInterfaceUsages.cpp" ]
+}

>From 319d29d21ea9f04cd8530c77b558796bf5db0197 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Thu, 25 Jul 2024 10:53:03 -0700
Subject: [PATCH 176/259] [Sema] Fix a warning

This patch fixes:

  clang/lib/Sema/SemaChecking.cpp:8220:3: error: default label in
  switch which covers all enumeration values
  [-Werror,-Wcovered-switch-default]
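
The usual fix for -Wcovered-switch-default is to drop the default label
and assert unreachability after the switch, so -Wswitch can still flag
any newly added enumerator. A self-contained sketch of the idiom (the
two-value enum and isNaNCheck helper are illustrative, not the actual
SemaChecking.cpp code):

  #include "llvm/Support/ErrorHandling.h"

  enum class MathCheck { NaN, Inf };

  static bool isNaNCheck(MathCheck Check) {
    switch (Check) {
    case MathCheck::NaN:
      return true;
    case MathCheck::Inf:
      return false;
    } // every enumerator handled; no default, so the warning does not fire
    llvm_unreachable("unknown MathCheck");
  }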
---
 clang/lib/Sema/SemaChecking.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 99a1471d9d06d..bde92e04b6b83 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -8217,9 +8217,8 @@ static bool IsInfOrNanFunction(StringRef calleeName, MathCheck Check) {
   case MathCheck::Inf:
     return MatchesAny({"__builtin_inf", "__builtin_inff", "__builtin_infl",
                        "__builtin_inff16", "__builtin_inff128"});
-  default:
-    return false;
   }
+  llvm_unreachable("unknown MathCheck");
 }
 
 void Sema::CheckInfNaNFunction(const CallExpr *Call,

>From e31794f99d72dd764c4bc5c5583a0a4c89df22c3 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 25 Jul 2024 18:54:24 +0100
Subject: [PATCH 177/259] [StackFrameLayoutAnalysis] Support more SlotTypes
 (#100562)

Add new SlotTypes to StackFrameLayoutAnalysis to disambiguate Fixed and
Variable-Sized stack slots from Variable slots. As Offsets are
unreliable for VLA-area objects, sort these to the end of the list,
using the Frame Index to ensure a deterministic order when Offsets are
equal.
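
In other words: fixed-offset slots sort before variable-sized ones, then
by descending offset, with the frame index as the final tie-breaker. A
small standalone sketch of that tuple-based ordering (simplified types,
not the pass itself):

  #include <algorithm>
  #include <cstdio>
  #include <tuple>
  #include <vector>

  struct Slot {
    int Index;      // frame index
    long Offset;    // combined fixed + scalable offset
    bool VarSized;  // VLA-area object whose offset is unreliable
  };

  int main() {
    std::vector<Slot> Slots = {
        {2, -16, false}, {4, -32, true}, {0, 8, false}, {3, -16, false}};
    std::sort(Slots.begin(), Slots.end(), [](const Slot &L, const Slot &R) {
      // Non-variable-sized first, then larger offsets first, then larger
      // frame index first, so equal offsets still get a deterministic order.
      return std::make_tuple(!L.VarSized, L.Offset, L.Index) >
             std::make_tuple(!R.VarSized, R.Offset, R.Index);
    });
    for (const Slot &S : Slots)
      std::printf("fi#%d offset=%ld%s\n", S.Index, S.Offset,
                  S.VarSized ? " (variable-sized)" : "");
    // Prints fi#0, fi#3, fi#2, then the variable-sized fi#4 last.
  }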
---
 .../CodeGen/StackFrameLayoutAnalysisPass.cpp  | 27 ++++++++++++++++---
 .../CodeGen/AArch64/sve-stack-frame-layout.ll | 25 ++++++++++-------
 .../CodeGen/X86/stack-frame-layout-remarks.ll | 12 ++++-----
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index ff77685f8f354..0a7a6bad4e86d 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -51,6 +51,8 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
 
   enum SlotType {
     Spill,          // a Spill slot
+    Fixed,          // a Fixed slot (e.g. arguments passed on the stack)
+    VariableSized,  // a variable sized object
     StackProtector, // Stack Protector slot
     Variable,       // a slot used to store a local data (could be a tmp)
     Invalid         // It's an error for a slot to have this type
@@ -72,17 +74,30 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
       Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector;
       if (MFI.isSpillSlotObjectIndex(Idx))
         SlotTy = SlotType::Spill;
-      else if (Idx == MFI.getStackProtectorIndex())
+      else if (MFI.isFixedObjectIndex(Idx))
+        SlotTy = SlotType::Fixed;
+      else if (MFI.isVariableSizedObjectIndex(Idx))
+        SlotTy = SlotType::VariableSized;
+      else if (MFI.hasStackProtectorIndex() &&
+               Idx == MFI.getStackProtectorIndex())
         SlotTy = SlotType::StackProtector;
       else
         SlotTy = SlotType::Variable;
     }
 
+    bool isVarSize() const { return SlotTy == SlotType::VariableSized; }
+
     // We use this to sort in reverse order, so that the layout is displayed
-    // correctly.
+    // correctly. Variable sized slots are sorted to the end of the list, as
+    // offsets are currently incorrect for these but they reside at the end of
+    // the stack frame. The Slot index is used to ensure deterministic order
+    // when offsets are equal.
     bool operator<(const SlotData &Rhs) const {
-      return (Offset.getFixed() + Offset.getScalable()) >
-             (Rhs.Offset.getFixed() + Rhs.Offset.getScalable());
+      return std::make_tuple(!isVarSize(),
+                             Offset.getFixed() + Offset.getScalable(), Slot) >
+             std::make_tuple(!Rhs.isVarSize(),
+                             Rhs.Offset.getFixed() + Rhs.Offset.getScalable(),
+                             Rhs.Slot);
     }
   };
 
@@ -121,6 +136,10 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     switch (Ty) {
     case SlotType::Spill:
       return "Spill";
+    case SlotType::Fixed:
+      return "Fixed";
+    case SlotType::VariableSized:
+      return "VariableSized";
     case SlotType::StackProtector:
       return "Protector";
     case SlotType::Variable:
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 36bca2ebd4ada..431c9dc76508f 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -147,10 +147,11 @@ entry:
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 1, Size: 0
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0
 
 define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla:
@@ -172,7 +173,10 @@ define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    add x8, x8, #15
 ; CHECK-NEXT:    and x8, x8, #0x7fffffff0
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    sub x9, x9, x8
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    mov z1.s, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
@@ -181,8 +185,9 @@ define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_
 ; CHECK-NEXT:    str wzr, [x8]
 ; CHECK-NEXT:    sub x8, x29, #8
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    str d0, [x19, #8]
+; CHECK-NEXT:    str wzr, [x9]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
+; CHECK-NEXT:    str d0, [x19, #8]
 ; CHECK-NEXT:    sub sp, x29, #8
 ; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
@@ -191,18 +196,20 @@ define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_
 entry:
   %a = alloca <vscale x 4 x i32>
   %0 = zext i32 %i to i64
-  %b = alloca i32, i64 %0
+  %vla0 = alloca i32, i64 %0
+  %vla1 = alloca i32, i64 %0
   %c = alloca double
   tail call void asm sideeffect "", "~{d8}"() #1
   store <vscale x 4 x i32> zeroinitializer, ptr %a
-  store i32 zeroinitializer, ptr %b
+  store i32 zeroinitializer, ptr %vla0
+  store i32 zeroinitializer, ptr %vla1
   store double %d, ptr %c
   ret i32 0
 }
 
 ; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_stackargsi32f64
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+8], Type: Variable, Align: 8, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Protector, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+8], Type: Fixed, Align: 8, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Fixed, Align: 16, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16
@@ -289,7 +296,7 @@ entry:
 }
 
 ; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_stackargsi32_fp
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Protector, Align: 16, Size: 4
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP+0], Type: Fixed, Align: 16, Size: 4
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
@@ -514,7 +521,7 @@ declare ptr @memset(ptr, i32, i32)
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-104], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-112], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: Variable, Align: 16, Size: 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: Variable, Align: 16, Size: 0
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-128], Type: VariableSized, Align: 16, Size: 0
 
 define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "target-features"="+sme" {
 ; CHECK-LABEL: vastate:
diff --git a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
index cd5edcf2ae502..d8ce5b041042e 100644
--- a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
+++ b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
@@ -35,7 +35,7 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
 
 ; BOTH: Function: cleanup_array
-; BOTH-NEXT:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH-NEXT:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: a @ dot.c:13
 ; STRIPPED-NOT: a @ dot.c:13
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -47,7 +47,7 @@ define void @cleanup_array(ptr %0) #1 {
 }
 
 ; BOTH: Function: cleanup_result
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: res @ dot.c:21
 ; STRIPPED-NOT: res @ dot.c:21
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -59,11 +59,11 @@ define void @cleanup_result(ptr %0) #1 {
 }
 
 ; BOTH: Function: do_work
-; BOTH:  Offset: [SP+12], Type: Variable, Align: 8, Size: 4
+; BOTH:  Offset: [SP+12], Type: Fixed, Align: 8, Size: 4
 ; DEBUG: out @ dot.c:32
 ; STRIPPED-NOT: out @ dot.c:32
-; BOTH:  Offset: [SP+8], Type: Variable, Align: 4, Size: 4
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+8], Type: Fixed, Align: 4, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: A @ dot.c:32
 ; STRIPPED-NOT: A @ dot.c:32
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4
@@ -125,7 +125,7 @@ define i32 @do_work(ptr %0, ptr %1, ptr %2) #2 {
 }
 
 ; BOTH: Function: gen_array
-; BOTH:  Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH:  Offset: [SP+4], Type: Fixed, Align: 16, Size: 4
 ; DEBUG: size @ dot.c:62
 ; STRIPPED-NOT: size @ dot.c:62
 ; BOTH:  Offset: [SP-4], Type: Spill, Align: 8, Size: 4

>From 88fb56ebf25de6288510364a4be3fbd7ae9a7246 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Thu, 25 Jul 2024 19:56:16 +0200
Subject: [PATCH 178/259] [libc] Fix broken table introduced by PR #100578

---
 libc/docs/date_and_time.rst | 120 ++++++++++++++++++------------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/libc/docs/date_and_time.rst b/libc/docs/date_and_time.rst
index 6207375375217..db77d2c5ba61a 100644
--- a/libc/docs/date_and_time.rst
+++ b/libc/docs/date_and_time.rst
@@ -46,64 +46,64 @@ Implementation Status
 
   - barebone-riscv32 - to be added
 
-+---------------------+---------------------------------------+-------------------+-------------------+-------------------+-------------------+
-| <Func>              |  Linux                                | Windows           | MacOS             | Embedded          | GPU               |
-|                     +---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
++---------------------+-----------------------------------------------+-------------------+-------------------+-------------------+-------------------+
+| <Func>              |  Linux                                        | Windows           | MacOS             | Embedded          | GPU               |
+|                     +---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
 |                     | x86_64  | aarch64 | aarch32 | riscv64/riscv32 | x86_64  | aarch64 | x86_64  | aarch64 | aarch32 | riscv32 | AMD     | nVidia  |
-+=====================+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+
-| asctime             | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| asctime_r           | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock               | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_getcpuclockid |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_getres        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_gettime       | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_nanosleep     |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| clock_settime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| ctime               |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| ctime_r             |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| difftime            | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| getdate             |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gettimeofday        | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gmtime              | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| gmtime_r            | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| localtime           |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| localtime_r         |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| mktime              | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| nanosleep           | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| strftime            |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| strptime            |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| time                | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_create        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_delete        |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_gettime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_getoverrun    |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| timer_settime       |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| tzset               |         |         |         |         |         |         |         |         |         |         |         |         |
-+---------------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
++=====================+=========+=========+=========+=================+=========+=========+=========+=========+=========+=========+=========+=========+
+| asctime             | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| asctime_r           | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock               | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_getcpuclockid |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_getres        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_gettime       | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_nanosleep     |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| clock_settime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| ctime               |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| ctime_r             |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| difftime            | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| getdate             |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gettimeofday        | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gmtime              | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| gmtime_r            | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| localtime           |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| localtime_r         |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| mktime              | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nanosleep           | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| strftime            |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| strptime            |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| time                | |check| | |check| |         |     |check|     |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_create        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_delete        |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_gettime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_getoverrun    |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| timer_settime       |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+
+| tzset               |         |         |         |                 |         |         |         |         |         |         |         |         |
++---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+

>From b365dbbd8dcc8cd3732995dd95b380f77b3337e3 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail at justinbogner.com>
Date: Thu, 25 Jul 2024 11:22:04 -0700
Subject: [PATCH 179/259] [DXIL][Analysis] Move dxil::ResourceInfo to the
 Analysis library. NFC

I had put this in Transforms/Utils, but that doesn't actually make
sense if we want to populate these structures via an analysis pass.

Pull Request: https://github.com/llvm/llvm-project/pull/100621
---
 .../llvm/{Transforms/Utils => Analysis}/DXILResource.h    | 8 ++++----
 llvm/lib/Analysis/CMakeLists.txt                          | 1 +
 llvm/lib/{Transforms/Utils => Analysis}/DXILResource.cpp  | 4 ++--
 llvm/lib/Transforms/Utils/CMakeLists.txt                  | 1 -
 llvm/unittests/Analysis/CMakeLists.txt                    | 1 +
 .../{Transforms/Utils => Analysis}/DXILResourceTest.cpp   | 2 +-
 llvm/unittests/Transforms/Utils/CMakeLists.txt            | 1 -
 7 files changed, 9 insertions(+), 9 deletions(-)
 rename llvm/include/llvm/{Transforms/Utils => Analysis}/DXILResource.h (97%)
 rename llvm/lib/{Transforms/Utils => Analysis}/DXILResource.cpp (99%)
 rename llvm/unittests/{Transforms/Utils => Analysis}/DXILResourceTest.cpp (99%)

diff --git a/llvm/include/llvm/Transforms/Utils/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
similarity index 97%
rename from llvm/include/llvm/Transforms/Utils/DXILResource.h
rename to llvm/include/llvm/Analysis/DXILResource.h
index df01fb457977a..ac1cefd98dbe3 100644
--- a/llvm/include/llvm/Transforms/Utils/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -1,4 +1,4 @@
-//===- DXILResource.h - Tools to translate DXIL resources -------*- C++ -*-===//
+//===- DXILResource.h - Representations of DXIL resources -------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
-#define LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
+#ifndef LLVM_ANALYSIS_DXILRESOURCE_H
+#define LLVM_ANALYSIS_DXILRESOURCE_H
 
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
@@ -188,4 +188,4 @@ class ResourceInfo {
 } // namespace dxil
 } // namespace llvm
 
-#endif // LLVM_TRANSFORMS_UTILS_DXILRESOURCE_H
+#endif // LLVM_ANALYSIS_DXILRESOURCE_H
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 74476cb5440c6..997bb7a0bb178 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_component_library(LLVMAnalysis
   DomPrinter.cpp
   DomTreeUpdater.cpp
   DominanceFrontier.cpp
+  DXILResource.cpp
   FunctionPropertiesAnalysis.cpp
   GlobalsModRef.cpp
   GuardUtils.cpp
diff --git a/llvm/lib/Transforms/Utils/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
similarity index 99%
rename from llvm/lib/Transforms/Utils/DXILResource.cpp
rename to llvm/lib/Analysis/DXILResource.cpp
index de2b6512a6d1c..cbe634c4b91aa 100644
--- a/llvm/lib/Transforms/Utils/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -1,4 +1,4 @@
-//===- DXILResource.cpp - Tools to translate DXIL resources ---------------===//
+//===- DXILResource.cpp - Representations of DXIL resources ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/DXILResource.h"
+#include "llvm/Analysis/DXILResource.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/DerivedTypes.h"
 
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 1b811c7cebef9..51e8821773c3a 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_component_library(LLVMTransformUtils
   CountVisits.cpp
   Debugify.cpp
   DemoteRegToStack.cpp
-  DXILResource.cpp
   DXILUpgrade.cpp
   EntryExitInstrumenter.cpp
   EscapeEnumerator.cpp
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index b1aeaa6e71fd4..3cba630867a83 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -24,6 +24,7 @@ set(ANALYSIS_TEST_SOURCES
   ConstraintSystemTest.cpp
   DDGTest.cpp
   DomTreeUpdaterTest.cpp
+  DXILResourceTest.cpp
   GlobalsModRefTest.cpp
   FunctionPropertiesAnalysisTest.cpp
   InlineCostTest.cpp
diff --git a/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
similarity index 99%
rename from llvm/unittests/Transforms/Utils/DXILResourceTest.cpp
rename to llvm/unittests/Analysis/DXILResourceTest.cpp
index 1e494cbb7da92..036ba40f275d0 100644
--- a/llvm/unittests/Transforms/Utils/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/DXILResource.h"
+#include "llvm/Analysis/DXILResource.h"
 #include "llvm/IR/Constants.h"
 #include "gtest/gtest.h"
 
diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt
index 8a14a5b8e249e..35055baa05ee9 100644
--- a/llvm/unittests/Transforms/Utils/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt
@@ -18,7 +18,6 @@ add_llvm_unittest(UtilsTests
   CodeLayoutTest.cpp
   CodeMoverUtilsTest.cpp
   DebugifyTest.cpp
-  DXILResourceTest.cpp
   FunctionComparatorTest.cpp
   IntegerDivisionTest.cpp
   LocalTest.cpp

>From 3c210740649a538d2fc2432395f2ef41c56dc8b6 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn at google.com>
Date: Thu, 25 Jul 2024 11:23:38 -0700
Subject: [PATCH 180/259] [NFC][libc][malloc] Refactor Block (#100445)

This decreases the surface area of the block implementation in
preparation for deeper changes to its implementation.

See #98096

- Remove dead member functions.
- Remove last() check from next(), as described in its comment.
- Rework object lifetimes such that only block headers are actually
considered live. This simplifies their implementation.
- The allocated storage becomes live at the outer call to malloc-family
functions via a special case in the C++ standard.
- Add asserts for flag properties required by the implementation.
- Remove static from member functions that don't invalidate the block.
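For reference, a small usage sketch of the reworked API (illustrative only,
written in the style of the updated unit tests below; the 1024/512 sizes are
arbitrary):

  alignas(BlockType::ALIGNMENT) array<byte, 1024> bytes;
  auto result = BlockType::init(bytes);
  BlockType *block1 = *result;             // one free block spanning the buffer

  auto split_result = block1->split(512);  // was: BlockType::split(block1, 512)
  BlockType *block2 = *split_result;

  // next() now returns nullptr for the last block instead of a past-the-end
  // sentinel, and merge_next() is an instance method.
  EXPECT_EQ(block2->next(), static_cast<BlockType *>(nullptr));
  EXPECT_TRUE(block1->merge_next());       // was: BlockType::merge_next(block1)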
---
 libc/src/__support/block.h             | 181 +++++++----------------
 libc/src/__support/freelist_heap.h     |   6 +-
 libc/test/src/__support/block_test.cpp | 193 ++++---------------------
 3 files changed, 82 insertions(+), 298 deletions(-)

diff --git a/libc/src/__support/block.h b/libc/src/__support/block.h
index 86cb4bd7ad582..6e7186f3224b9 100644
--- a/libc/src/__support/block.h
+++ b/libc/src/__support/block.h
@@ -159,13 +159,10 @@ class Block {
     return reinterpret_cast<const cpp::byte *>(this) + BLOCK_OVERHEAD;
   }
 
-  /// Marks the block as free and merges it with any free neighbors.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer. If neither member is free, the returned pointer will point to the
-  /// original block. Otherwise, it will point to the new, larger block created
-  /// by merging adjacent free blocks together.
-  static void free(Block *&block);
+  // @returns The region of memory the block manages, including the header.
+  ByteSpan region() {
+    return {reinterpret_cast<cpp::byte *>(this), outer_size()};
+  }
 
   /// Attempts to split this block.
   ///
@@ -176,75 +173,34 @@ class Block {
   /// This method may fail if the remaining space is too small to hold a new
   /// block. If this method fails for any reason, the original block is
   /// unmodified.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer with a pointer to the new, smaller block.
-  static optional<Block *> split(Block *&block, size_t new_inner_size);
+  optional<Block *> split(size_t new_inner_size);
 
   /// Merges this block with the one that comes after it.
-  ///
-  /// This method is static in order to consume and replace the given block
-  /// pointer with a pointer to the new, larger block.
-  static bool merge_next(Block *&block);
+  bool merge_next();
 
-  /// Fetches the block immediately after this one.
-  ///
-  /// For performance, this always returns a block pointer, even if the returned
-  /// pointer is invalid. The pointer is valid if and only if `last()` is false.
-  ///
-  /// Typically, after calling `Init` callers may save a pointer past the end of
-  /// the list using `next()`. This makes it easy to subsequently iterate over
-  /// the list:
-  /// @code{.cpp}
-  ///   auto result = Block<>::init(byte_span);
-  ///   Block<>* begin = *result;
-  ///   Block<>* end = begin->next();
-  ///   ...
-  ///   for (auto* block = begin; block != end; block = block->next()) {
-  ///     // Do something which each block.
-  ///   }
-  /// @endcode
+  /// @returns The block immediately after this one, or a null pointer if this
+  /// is the last block.
   Block *next() const;
 
-  /// @copydoc `next`.
-  static Block *next_block(const Block *block) {
-    return block == nullptr ? nullptr : block->next();
-  }
-
   /// @returns The block immediately before this one, or a null pointer if this
   /// is the first block.
   Block *prev() const;
 
-  /// @copydoc `prev`.
-  static Block *prev_block(const Block *block) {
-    return block == nullptr ? nullptr : block->prev();
-  }
-
   /// Indicates whether the block is in use.
   ///
   /// @returns `true` if the block is in use or `false` if not.
   bool used() const { return next_ & USED_MASK; }
 
-  /// Indicates whether this block is the last block or not (i.e. whether
-  /// `next()` points to a valid block or not). This is needed because
-  /// `next()` points to the end of this block, whether there is a valid
-  /// block there or not.
-  ///
-  /// @returns `true` is this is the last block or `false` if not.
-  bool last() const { return next_ & LAST_MASK; }
-
   /// Marks this block as in use.
   void mark_used() { next_ |= USED_MASK; }
 
   /// Marks this block as free.
   void mark_free() { next_ &= ~USED_MASK; }
 
-  /// Marks this block as the last one in the chain.
+  /// Marks this block as the last one in the chain. Makes next() return
+  /// nullptr.
   constexpr void mark_last() { next_ |= LAST_MASK; }
 
-  /// Clears the last bit from this block.
-  void clear_last() { next_ &= ~LAST_MASK; }
-
   /// @brief Checks if a block is valid.
   ///
   /// @returns `true` if and only if the following conditions are met:
@@ -314,10 +270,8 @@ class Block {
   static BlockInfo allocate(Block *block, size_t alignment, size_t size);
 
 private:
-  /// Consumes the block and returns as a span of bytes.
-  static ByteSpan as_bytes(Block *&&block);
-
-  /// Consumes the span of bytes and uses it to construct and return a block.
+  /// Construct a block to represent a span of bytes. Overwrites only enough
+  /// memory for the block header; the rest of the span is left alone.
   static Block *as_block(size_t prev_outer_size, ByteSpan bytes);
 
   /// Returns a `BlockStatus` that is either VALID or indicates the reason why
@@ -329,7 +283,7 @@ class Block {
 
   /// Like `split`, but assumes the caller has already checked to parameters to
   /// ensure the split will succeed.
-  static Block *split_impl(Block *&block, size_t new_inner_size);
+  Block *split_impl(size_t new_inner_size);
 
   /// Offset from this block to the previous block. 0 if this is the first
   /// block.
@@ -389,20 +343,6 @@ Block<OffsetType, kAlign>::init(ByteSpan region) {
   return block;
 }
 
-template <typename OffsetType, size_t kAlign>
-void Block<OffsetType, kAlign>::free(Block *&block) {
-  if (block == nullptr)
-    return;
-
-  block->mark_free();
-  Block *prev = block->prev();
-
-  if (merge_next(prev))
-    block = prev;
-
-  merge_next(block);
-}
-
 template <typename OffsetType, size_t kAlign>
 bool Block<OffsetType, kAlign>::can_allocate(size_t alignment,
                                              size_t size) const {
@@ -436,7 +376,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
 
     Block *original = info.block;
     optional<Block *> maybe_aligned_block =
-        Block::split(original, adjustment - BLOCK_OVERHEAD);
+        original->split(adjustment - BLOCK_OVERHEAD);
     LIBC_ASSERT(maybe_aligned_block.has_value() &&
                 "This split should always result in a new block. The check in "
                 "`can_allocate` ensures that we have enough space here to make "
@@ -445,7 +385,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
     if (Block *prev = original->prev()) {
       // If there is a block before this, we can merge the current one with the
       // newly created one.
-      merge_next(prev);
+      prev->merge_next();
     } else {
       // Otherwise, this was the very first block in the chain. Now we can make
       // it the new first block.
@@ -459,7 +399,7 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
   }
 
   // Now get a block for the requested size.
-  if (optional<Block *> next = Block::split(info.block, size))
+  if (optional<Block *> next = info.block->split(size))
     info.next = *next;
 
   return info;
@@ -467,14 +407,11 @@ Block<OffsetType, kAlign>::allocate(Block *block, size_t alignment,
 
 template <typename OffsetType, size_t kAlign>
 optional<Block<OffsetType, kAlign> *>
-Block<OffsetType, kAlign>::split(Block *&block, size_t new_inner_size) {
-  if (block == nullptr)
+Block<OffsetType, kAlign>::split(size_t new_inner_size) {
+  if (used())
     return {};
 
-  if (block->used())
-    return {};
-
-  size_t old_inner_size = block->inner_size();
+  size_t old_inner_size = inner_size();
   new_inner_size = align_up(new_inner_size, ALIGNMENT);
   if (old_inner_size < new_inner_size)
     return {};
@@ -482,61 +419,56 @@ Block<OffsetType, kAlign>::split(Block *&block, size_t new_inner_size) {
   if (old_inner_size - new_inner_size < BLOCK_OVERHEAD)
     return {};
 
-  return split_impl(block, new_inner_size);
+  return split_impl(new_inner_size);
 }
 
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *
-Block<OffsetType, kAlign>::split_impl(Block *&block, size_t new_inner_size) {
-  size_t prev_outer_size = block->prev_;
+Block<OffsetType, kAlign>::split_impl(size_t new_inner_size) {
   size_t outer_size1 = new_inner_size + BLOCK_OVERHEAD;
-  bool is_last = block->last();
-  ByteSpan bytes = as_bytes(cpp::move(block));
-  Block *block1 = as_block(prev_outer_size, bytes.subspan(0, outer_size1));
-  Block *block2 = as_block(outer_size1, bytes.subspan(outer_size1));
-
-  if (is_last)
-    block2->mark_last();
-  else
-    block2->next()->prev_ = block2->next_;
-
-  block = cpp::move(block1);
-  return block2;
+  bool has_next = next();
+  ByteSpan new_region = region().subspan(outer_size1);
+  LIBC_ASSERT(!used() && "used blocks cannot be split");
+  // The low order bits of outer_size1 should both be zero, which is also the
+  // correct value for the flags here (not used, not last).
+  next_ = outer_size1;
+  LIBC_ASSERT(!used() && next() && "incorrect first split flags");
+  Block *new_block = as_block(outer_size1, new_region);
+
+  if (has_next) {
+    // The two flags are both false, so next_ is a plain size.
+    LIBC_ASSERT(!new_block->used() && next() && "flags disrupt use of size");
+    new_block->next()->prev_ = new_block->next_;
+  } else {
+    new_block->mark_last();
+  }
+  return new_block;
 }
 
 template <typename OffsetType, size_t kAlign>
-bool Block<OffsetType, kAlign>::merge_next(Block *&block) {
-  if (block == nullptr)
-    return false;
-
-  if (block->last())
-    return false;
-
-  Block *next = block->next();
-  if (block->used() || next->used())
+bool Block<OffsetType, kAlign>::merge_next() {
+  if (used() || !next() || next()->used())
     return false;
 
-  size_t prev_outer_size = block->prev_;
-  bool is_last = next->last();
-  ByteSpan prev_bytes = as_bytes(cpp::move(block));
-  ByteSpan next_bytes = as_bytes(cpp::move(next));
-  size_t outer_size = prev_bytes.size() + next_bytes.size();
-  cpp::byte *merged = ::new (prev_bytes.data()) cpp::byte[outer_size];
-  block = as_block(prev_outer_size, ByteSpan(merged, outer_size));
+  // Extend the size and copy the last() flag from the next block to this one.
+  next_ &= SIZE_MASK;
+  next_ += next()->next_;
 
-  if (is_last)
-    block->mark_last();
-  else
-    block->next()->prev_ = block->next_;
+  if (next()) {
+    // The two flags are both false, so next_ is a plain size.
+    LIBC_ASSERT(!used() && next() && "flags disrupt use of size");
+    next()->prev_ = next_;
+  }
 
   return true;
 }
 
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *Block<OffsetType, kAlign>::next() const {
-  uintptr_t addr =
-      last() ? 0 : reinterpret_cast<uintptr_t>(this) + outer_size();
-  return reinterpret_cast<Block *>(addr);
+  if (next_ & LAST_MASK)
+    return nullptr;
+  return reinterpret_cast<Block *>(reinterpret_cast<uintptr_t>(this) +
+                                   outer_size());
 }
 
 template <typename OffsetType, size_t kAlign>
@@ -555,13 +487,6 @@ constexpr Block<OffsetType, kAlign>::Block(size_t prev_outer_size,
   next_ = outer_size;
 }
 
-template <typename OffsetType, size_t kAlign>
-ByteSpan Block<OffsetType, kAlign>::as_bytes(Block *&&block) {
-  size_t block_size = block->outer_size();
-  cpp::byte *bytes = new (cpp::move(block)) cpp::byte[block_size];
-  return {bytes, block_size};
-}
-
 template <typename OffsetType, size_t kAlign>
 Block<OffsetType, kAlign> *
 Block<OffsetType, kAlign>::as_block(size_t prev_outer_size, ByteSpan bytes) {
@@ -573,7 +498,7 @@ internal::BlockStatus Block<OffsetType, kAlign>::check_status() const {
   if (reinterpret_cast<uintptr_t>(this) % ALIGNMENT != 0)
     return internal::BlockStatus::MISALIGNED;
 
-  if (!last() && (this >= next() || this != next()->prev()))
+  if (next() && (this >= next() || this != next()->prev()))
     return internal::BlockStatus::NEXT_MISMATCHED;
 
   if (prev() && (this <= prev() || this != prev()->next()))
diff --git a/libc/src/__support/freelist_heap.h b/libc/src/__support/freelist_heap.h
index ce4f14b431585..a2c714e15ba87 100644
--- a/libc/src/__support/freelist_heap.h
+++ b/libc/src/__support/freelist_heap.h
@@ -189,19 +189,19 @@ template <size_t NUM_BUCKETS> void FreeListHeap<NUM_BUCKETS>::free(void *ptr) {
   BlockType *prev = chunk_block->prev();
   BlockType *next = nullptr;
 
-  if (!chunk_block->last())
+  if (chunk_block->next())
     next = chunk_block->next();
 
   if (prev != nullptr && !prev->used()) {
     // Remove from freelist and merge
     freelist_.remove_chunk(block_to_span(prev));
     chunk_block = chunk_block->prev();
-    BlockType::merge_next(chunk_block);
+    chunk_block->merge_next();
   }
 
   if (next != nullptr && !next->used()) {
     freelist_.remove_chunk(block_to_span(next));
-    BlockType::merge_next(chunk_block);
+    chunk_block->merge_next();
   }
   // Add back to the freelist
   freelist_.add_chunk(block_to_span(chunk_block));
diff --git a/libc/test/src/__support/block_test.cpp b/libc/test/src/__support/block_test.cpp
index 04704482b5147..ecce00b7926f9 100644
--- a/libc/test/src/__support/block_test.cpp
+++ b/libc/test/src/__support/block_test.cpp
@@ -54,7 +54,6 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCreateSingleAlignedBlock) {
   EXPECT_EQ(block->prev(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(block->next(), static_cast<BlockType *>(nullptr));
   EXPECT_FALSE(block->used());
-  EXPECT_TRUE(block->last());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CanCreateUnalignedSingleBlock) {
@@ -96,18 +95,16 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitBlock) {
   ASSERT_TRUE(result.has_value());
   auto *block1 = *result;
 
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
 
   auto *block2 = *result;
 
   EXPECT_EQ(block1->inner_size(), kSplitN);
   EXPECT_EQ(block1->outer_size(), kSplitN + BlockType::BLOCK_OVERHEAD);
-  EXPECT_FALSE(block1->last());
 
   EXPECT_EQ(block2->outer_size(), kN - kSplitN - BlockType::BLOCK_OVERHEAD);
   EXPECT_FALSE(block2->used());
-  EXPECT_TRUE(block2->last());
 
   EXPECT_EQ(block1->next(), block2);
   EXPECT_EQ(block2->prev(), block1);
@@ -128,7 +125,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitBlockUnaligned) {
   split_addr += alignof(BlockType) - (split_addr % alignof(BlockType));
   uintptr_t split_len = split_addr - (uintptr_t)&bytes;
 
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
@@ -161,11 +158,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CanSplitMidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block1, kSplit2);
+  result = block1->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
@@ -184,7 +181,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitTooSmallBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, kSplitN);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -197,13 +194,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitBlockWithoutHeaderSpace) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, kSplitN);
-  ASSERT_FALSE(result.has_value());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CannotSplitNull) {
-  BlockType *block = nullptr;
-  auto result = BlockType::split(block, 1);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -216,7 +207,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMakeBlockLargerInSplit) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, block->inner_size() + 1);
+  result = block->split(block->inner_size() + 1);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -229,8 +220,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMakeSecondBlockLargerInSplit) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, block->inner_size() -
-                                       BlockType::BLOCK_OVERHEAD + 1);
+  result = block->split(block->inner_size() - BlockType::BLOCK_OVERHEAD + 1);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -243,7 +233,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMakeZeroSizeFirstBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block = *result;
 
-  result = BlockType::split(block, 0);
+  result = block->split(0);
   ASSERT_TRUE(result.has_value());
   EXPECT_EQ(block->inner_size(), static_cast<size_t>(0));
 }
@@ -257,8 +247,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMakeZeroSizeSecondBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1,
-                            block1->inner_size() - BlockType::BLOCK_OVERHEAD);
+  result = block1->split(block1->inner_size() - BlockType::BLOCK_OVERHEAD);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
@@ -293,7 +282,7 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotSplitUsedBlock) {
   BlockType *block = *result;
 
   block->mark_used();
-  result = BlockType::split(block, kSplitN);
+  result = block->split(kSplitN);
   ASSERT_FALSE(result.has_value());
 }
 
@@ -309,14 +298,14 @@ TEST_FOR_EACH_BLOCK_TYPE(CanMergeWithNextBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
 
-  result = BlockType::split(block1, kSplit2);
+  result = block1->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
-  EXPECT_TRUE(BlockType::merge_next(block3));
+  EXPECT_TRUE(block3->merge_next());
 
   EXPECT_EQ(block1->next(), block3);
   EXPECT_EQ(block3->prev(), block1);
@@ -334,16 +323,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMergeWithFirstOrLastBlock) {
   BlockType *block1 = *result;
 
   // Do a split, just to check that the checks on next/prev are different...
-  result = BlockType::split(block1, kSplitN);
+  result = block1->split(kSplitN);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  EXPECT_FALSE(BlockType::merge_next(block2));
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CannotMergeNull) {
-  BlockType *block = nullptr;
-  EXPECT_FALSE(BlockType::merge_next(block));
+  EXPECT_FALSE(block2->merge_next());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CannotMergeUsedBlock) {
@@ -356,131 +340,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CannotMergeUsedBlock) {
   BlockType *block = *result;
 
   // Do a split, just to check that the checks on next/prev are different...
-  result = BlockType::split(block, kSplitN);
-  ASSERT_TRUE(result.has_value());
-
-  block->mark_used();
-  EXPECT_FALSE(BlockType::merge_next(block));
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeSingleBlock) {
-  constexpr size_t kN = 1024;
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-
-  auto result = BlockType::init(bytes);
+  result = block->split(kSplitN);
   ASSERT_TRUE(result.has_value());
-  BlockType *block = *result;
 
   block->mark_used();
-  BlockType::free(block);
-  EXPECT_FALSE(block->used());
-  EXPECT_EQ(block->outer_size(), kN);
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockWithoutMerging) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block3 = *result;
-
-  block1->mark_used();
-  block2->mark_used();
-  block3->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_NE(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_FALSE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockAndMergeWithPrev) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block3 = *result;
-
-  block2->mark_used();
-  block3->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_EQ(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_FALSE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeBlockAndMergeWithNext) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-
-  block1->mark_used();
-  block2->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_NE(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(block2->last());
-}
-
-TEST_FOR_EACH_BLOCK_TYPE(CanFreeUsedBlockAndMergeWithBoth) {
-  constexpr size_t kN = 1024;
-  constexpr size_t kSplit1 = 512;
-  constexpr size_t kSplit2 = 256;
-
-  alignas(BlockType::ALIGNMENT) array<byte, kN> bytes;
-  auto result = BlockType::init(bytes);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block1 = *result;
-
-  result = BlockType::split(block1, kSplit1);
-  ASSERT_TRUE(result.has_value());
-  BlockType *block2 = *result;
-
-  result = BlockType::split(block2, kSplit2);
-  ASSERT_TRUE(result.has_value());
-
-  block2->mark_used();
-
-  BlockType::free(block2);
-  EXPECT_FALSE(block2->used());
-  EXPECT_EQ(block2->prev(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(block2->last());
+  EXPECT_FALSE(block->merge_next());
 }
 
 TEST_FOR_EACH_BLOCK_TYPE(CanCheckValidBlock) {
@@ -493,11 +357,11 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCheckValidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block2, kSplit2);
+  result = block2->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
@@ -517,15 +381,15 @@ TEST_FOR_EACH_BLOCK_TYPE(CanCheckInvalidBlock) {
   ASSERT_TRUE(result.has_value());
   BlockType *block1 = *result;
 
-  result = BlockType::split(block1, kSplit1);
+  result = block1->split(kSplit1);
   ASSERT_TRUE(result.has_value());
   BlockType *block2 = *result;
 
-  result = BlockType::split(block2, kSplit2);
+  result = block2->split(kSplit2);
   ASSERT_TRUE(result.has_value());
   BlockType *block3 = *result;
 
-  result = BlockType::split(block3, kSplit3);
+  result = block3->split(kSplit3);
   ASSERT_TRUE(result.has_value());
 
   // Corrupt a Block header.
@@ -630,7 +494,6 @@ TEST_FOR_EACH_BLOCK_TYPE(AllocateAlreadyAligned) {
   // Check the next block.
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
-  EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(reinterpret_cast<byte *>(next) + next->outer_size(),
             bytes.data() + bytes.size());
 }
@@ -674,7 +537,6 @@ TEST_FOR_EACH_BLOCK_TYPE(AllocateNeedsAlignment) {
   // Check the next block.
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
-  EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
   EXPECT_EQ(reinterpret_cast<byte *>(next) + next->outer_size(), &*bytes.end());
 }
 
@@ -687,7 +549,7 @@ TEST_FOR_EACH_BLOCK_TYPE(PreviousBlockMergedIfNotFirst) {
   BlockType *block = *result;
 
   // Split the block roughly halfway and work on the second half.
-  auto result2 = BlockType::split(block, kN / 2);
+  auto result2 = block->split(kN / 2);
   ASSERT_TRUE(result2.has_value());
   BlockType *newblock = *result2;
   ASSERT_EQ(newblock->prev(), block);
@@ -744,20 +606,17 @@ TEST_FOR_EACH_BLOCK_TYPE(CanRemergeBlockAllocations) {
 
   // Check we have the appropriate blocks.
   ASSERT_NE(prev, static_cast<BlockType *>(nullptr));
-  ASSERT_FALSE(prev->last());
   ASSERT_EQ(aligned_block->prev(), prev);
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_NE(next, static_cast<BlockType *>(nullptr));
   EXPECT_EQ(aligned_block->next(), next);
   EXPECT_EQ(next->next(), static_cast<BlockType *>(nullptr));
-  ASSERT_TRUE(next->last());
 
   // Now check for successful merges.
-  EXPECT_TRUE(BlockType::merge_next(prev));
+  EXPECT_TRUE(prev->merge_next());
   EXPECT_EQ(prev->next(), next);
-  EXPECT_TRUE(BlockType::merge_next(prev));
+  EXPECT_TRUE(prev->merge_next());
   EXPECT_EQ(prev->next(), static_cast<BlockType *>(nullptr));
-  EXPECT_TRUE(prev->last());
 
   // We should have the original buffer.
   EXPECT_EQ(reinterpret_cast<byte *>(prev), &*bytes.begin());

>From 792b673d36a86ab9d45ed3b259a3fc7020826b76 Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev <vdzhidzhoev at accesssoftek.com>
Date: Thu, 25 Jul 2024 20:25:40 +0200
Subject: [PATCH 181/259] [lldb][test] Mark TestAssertMessages.py as passing
 remotely on Linux target. (#100586)

It has been marked as XFAIL in
de2ddc8f3146bd87152ea86b533541039541efe1, but I haven't found the reason
for that, and apparently it passes against the Linux target.
---
 lldb/test/API/assert_messages_test/TestAssertMessages.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/assert_messages_test/TestAssertMessages.py b/lldb/test/API/assert_messages_test/TestAssertMessages.py
index 3c54b9379d4c1..27d18e5ae99b9 100644
--- a/lldb/test/API/assert_messages_test/TestAssertMessages.py
+++ b/lldb/test/API/assert_messages_test/TestAssertMessages.py
@@ -23,7 +23,7 @@ def assert_expect_fails_with(self, cmd, expect_args, expected_msg):
         else:
             self.fail("Initial expect should have raised AssertionError!")
 
-    @expectedFailureAll(remote=True)
+    @expectedFailureAll(oslist=no_match(["linux"]), remote=True)
     def test_createTestTarget(self):
         try:
             self.createTestTarget("doesnt_exist")

>From 82c21f085a10e64fc5c21683224714be03e8506f Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail at justinbogner.com>
Date: Thu, 25 Jul 2024 11:30:03 -0700
Subject: [PATCH 182/259] [DXIL][Analysis] Replace #include with forward
 declaration. NFC

Pull Request: https://github.com/llvm/llvm-project/pull/100622
---
 llvm/include/llvm/Analysis/DXILResource.h | 3 ++-
 llvm/lib/Analysis/DXILResource.cpp        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index ac1cefd98dbe3..cca5e0f0bd759 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -9,11 +9,12 @@
 #ifndef LLVM_ANALYSIS_DXILRESOURCE_H
 #define LLVM_ANALYSIS_DXILRESOURCE_H
 
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/DXILABI.h"
 
 namespace llvm {
+class MDTuple;
+
 namespace dxil {
 
 struct ResourceBinding {
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index cbe634c4b91aa..d47a73c05a3e5 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -9,6 +9,7 @@
 #include "llvm/Analysis/DXILResource.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Metadata.h"
 
 using namespace llvm;
 using namespace dxil;

>From 4f8050806e14d9f7791bfaa33681cacb1ac02549 Mon Sep 17 00:00:00 2001
From: Scott Todd <scott.todd0 at gmail.com>
Date: Thu, 25 Jul 2024 11:37:31 -0700
Subject: [PATCH 183/259] [lld] Add explicit conversion for enum to Twine.
 (#100627)

This fixes `error: ambiguous conversion for functional-style cast from
'lld::macho::InputSection::Kind' to 'llvm::Twine'`, observed when
building with clang-9 and reported here:
https://github.com/llvm/llvm-project/pull/96268#discussion_r1691909931.
---
 lld/MachO/BPSectionOrderer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 26d4e0cb3987d..f6a974370836b 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -48,9 +48,9 @@ getRelocHash(const Reloc &reloc,
     sectionIdx = sectionIdxIt->getSecond();
   std::string kind;
   if (isec)
-    kind = ("Section " + Twine(isec->kind())).str();
+    kind = ("Section " + Twine((uint8_t)isec->kind())).str();
   if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
-    kind += (" Symbol " + Twine(sym->kind())).str();
+    kind += (" Symbol " + Twine((uint8_t)sym->kind())).str();
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (isa_and_nonnull<CStringInputSection>(isec))
         return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);

>From 2b162286eddcf0624b2a7731ea1b6d52bb332e8a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 25 Jul 2024 13:42:44 -0500
Subject: [PATCH 184/259] [NVPTX] Correctly forward the PTX feature to the
 nvlink wrapper (#100607)

Summary:
This is necessary for LTO when the user specifies the PTX feature or has a
CUDA installation that supports a sufficiently high PTX version. Previously
the nvlink wrapper would just use the default.

Fixes https://github.com/llvm/llvm-project/issues/100606
---
 clang/lib/Driver/ToolChains/Cuda.cpp     | 7 +++++++
 clang/test/Driver/cuda-cross-compiling.c | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index ffe5f4a6f4a25..e98e574d6cc2b 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -625,6 +625,13 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
                   C.getDriver().getLTOMode() == LTOK_Thin);
 
+  // Forward the PTX features if the nvlink-wrapper needs it.
+  std::vector<StringRef> Features;
+  getNVPTXTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
+                         Features);
+  for (StringRef Feature : Features)
+    CmdArgs.append({"--feature", Args.MakeArgString(Feature)});
+
   addGPULibraries(getToolChain(), Args, CmdArgs);
 
   // Add paths for the default clang library path.
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index 42d56cbfcc321..c2e538c25329e 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -90,3 +90,11 @@
 // RUN:   | FileCheck -check-prefix=GENERIC %s
 
 // GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"
+
+//
+// Test forwarding the necessary +ptx feature.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda --cuda-feature=+ptx63 -march=sm_52 -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=FEATURE %s
+
+// FEATURE: clang-nvlink-wrapper{{.*}}"--feature" "+ptx63"

>From 0307dd0d5dea6fbc3b218850e7061cd40a80fc58 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Thu, 25 Jul 2024 13:45:53 -0500
Subject: [PATCH 185/259] [libc++] Remove unnecessary check for
 LIBCXX_INCLUDE_TESTS (#100573)

This whole CMakeLists.txt is only included from the parent directory if
LIBCXX_INCLUDE_TESTS is true.
---
 libcxx/test/CMakeLists.txt | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index cdd1c2d90fbcf..2200de6a36ce7 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -34,24 +34,21 @@ endif()
 
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
-if (LIBCXX_INCLUDE_TESTS)
-  include(AddLLVM) # for configure_lit_site_cfg and add_lit_testsuite
+include(AddLLVM) # for configure_lit_site_cfg and add_lit_testsuite
 
-  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
-                 "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
-                 @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/configs/cmake-bridge.cfg.in"
+                "${CMAKE_CURRENT_BINARY_DIR}/cmake-bridge.cfg"
+                @ONLY)
 
-  configure_lit_site_cfg(
-    "${LIBCXX_TEST_CONFIG}"
-    ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
-    MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
+configure_lit_site_cfg(
+  "${LIBCXX_TEST_CONFIG}"
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
+  MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
 
-  add_lit_testsuite(check-cxx
-    "Running libcxx tests"
-    ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS cxx-test-depends)
-
-endif()
+add_lit_testsuite(check-cxx
+  "Running libcxx tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS cxx-test-depends)
 
 if (LIBCXX_GENERATE_COVERAGE)
   include(CodeCoverage)

>From 52b6b1eb976341956c9c75f6572c37d445fcb332 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Thu, 25 Jul 2024 13:46:53 -0500
Subject: [PATCH 186/259] [libc++] Remove unmaintained LIBCXX_CONFIGURE_IDE
 (#100574)

The premise of LIBCXX_CONFIGURE_IDE is nice; however, in practice this
setting has not been maintained and as a result it basically doesn't
work properly. For example, it doesn't take into account the headers we
generate, and its handling of the tests is too naive to be really
helpful.

This patch removes the setting in order to simplify the CMake setup a
bit, but most importantly to remove unnecessary interactions between our
main CMake build of the library and the test suite.
---
 libcxx/CMakeLists.txt      |  7 -------
 libcxx/src/CMakeLists.txt  | 14 --------------
 libcxx/test/CMakeLists.txt | 23 -----------------------
 3 files changed, 44 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 674082c7d1787..917c6becb7835 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -314,13 +314,6 @@ option(LIBCXX_GENERATE_COVERAGE "Enable generating code coverage." OFF)
 set(LIBCXX_COVERAGE_LIBRARY "" CACHE STRING
     "The Profile-rt library used to build with code coverage")
 
-set(LIBCXX_CONFIGURE_IDE_DEFAULT OFF)
-if (XCODE OR MSVC_IDE)
-  set(LIBCXX_CONFIGURE_IDE_DEFAULT ON)
-endif()
-option(LIBCXX_CONFIGURE_IDE "Configure libcxx for use within an IDE"
-      ${LIBCXX_CONFIGURE_IDE_DEFAULT})
-
 set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT OFF)
 if (WIN32)
   set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT ON)
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 0dfc9647558d4..bfc88c4c58812 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -143,20 +143,6 @@ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS)
     )
 endif()
 
-# Add all the headers to the project for IDEs.
-if (LIBCXX_CONFIGURE_IDE)
-  file(GLOB_RECURSE LIBCXX_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/*)
-  if(WIN32)
-    file( GLOB LIBCXX_WIN32_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/__support/win32/*.h)
-    list(APPEND LIBCXX_HEADERS ${LIBCXX_WIN32_HEADERS})
-  endif()
-  # Force them all into the headers dir on MSVC, otherwise they end up at
-  # project scope because they don't have extensions.
-  if (MSVC_IDE)
-    source_group("Header Files" FILES ${LIBCXX_HEADERS})
-  endif()
-endif()
-
 if(NOT LIBCXX_INSTALL_LIBRARY)
   set(exclude_from_all EXCLUDE_FROM_ALL)
 endif()
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 2200de6a36ce7..001b29efcfefa 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -61,26 +61,3 @@ if (LIBCXX_GENERATE_COVERAGE)
   set(extract_dirs "${LIBCXX_SOURCE_DIR}/include;${LIBCXX_SOURCE_DIR}/src")
   setup_lcov_test_target_coverage("cxx" "${output_dir}" "${capture_dirs}" "${extract_dirs}")
 endif()
-
-if (LIBCXX_CONFIGURE_IDE)
-  # Create dummy targets for each of the tests in the test suite, this allows
-  # IDE's such as CLion to correctly highlight the tests because it knows
-  # roughly what include paths/compile flags/macro definitions are needed.
-  include_directories(support)
-  file(GLOB_RECURSE LIBCXX_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/*.pass.cpp)
-  file(GLOB LIBCXX_TEST_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/support/*)
-  file(GLOB_RECURSE LIBCXX_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/*)
-  add_executable(libcxx_test_objects EXCLUDE_FROM_ALL
-          ${LIBCXX_TESTS} ${LIBCXX_TEST_HEADERS} ${LIBCXX_HEADERS})
-  add_dependencies(libcxx_test_objects cxx)
-
-  split_list(LIBCXX_COMPILE_FLAGS)
-  split_list(LIBCXX_LINK_FLAGS)
-
-  set_target_properties(libcxx_test_objects
-          PROPERTIES
-            COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}"
-            LINK_FLAGS "${LIBCXX_LINK_FLAGS}"
-            EXCLUDE_FROM_ALL ON
-  )
-endif()

>From 6618eca5f506ca1bde3c7cf43a91dd5121d12e8b Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Thu, 25 Jul 2024 18:51:00 +0000
Subject: [PATCH 187/259] [gn build] Port b365dbbd8dcc

---
 llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn               | 1 +
 llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn       | 1 -
 llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn         | 1 +
 llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn | 1 -
 4 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
index 4f03e01c39c1e..e47a5f7db5528 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Analysis/BUILD.gn
@@ -39,6 +39,7 @@ static_library("Analysis") {
     "CycleAnalysis.cpp",
     "DDG.cpp",
     "DDGPrinter.cpp",
+    "DXILResource.cpp",
     "Delinearization.cpp",
     "DemandedBits.cpp",
     "DependenceAnalysis.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
index 6ea9c8e4f4748..58acabf85d296 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -26,7 +26,6 @@ static_library("Utils") {
     "CodeMoverUtils.cpp",
     "CountVisits.cpp",
     "CtorUtils.cpp",
-    "DXILResource.cpp",
     "DXILUpgrade.cpp",
     "Debugify.cpp",
     "DemoteRegToStack.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
index 339994d35af94..4749ef0a733da 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
@@ -24,6 +24,7 @@ unittest("AnalysisTests") {
     "CaptureTrackingTest.cpp",
     "ConstraintSystemTest.cpp",
     "DDGTest.cpp",
+    "DXILResourceTest.cpp",
     "DomTreeUpdaterTest.cpp",
     "FunctionPropertiesAnalysisTest.cpp",
     "GlobalsModRefTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
index 4a924dab02908..380ed71a2bc01 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Utils/BUILD.gn
@@ -19,7 +19,6 @@ unittest("UtilsTests") {
     "CodeExtractorTest.cpp",
     "CodeLayoutTest.cpp",
     "CodeMoverUtilsTest.cpp",
-    "DXILResourceTest.cpp",
     "DebugifyTest.cpp",
     "FunctionComparatorTest.cpp",
     "IntegerDivisionTest.cpp",

>From cc4f98979b079b517edd8a71f56a8975f436e63d Mon Sep 17 00:00:00 2001
From: Anton Korobeynikov <anton at korobeynikov.info>
Date: Thu, 25 Jul 2024 11:57:46 -0700
Subject: [PATCH 188/259] Normalize ptrauth handling in sanitizer runtime
 (#100483)

1. Include `ptrauth.h` if the `ptrauth_intrinsics` language feature is specified (per the ptrauth spec, this is what enables `ptrauth.h` usage and functions like `ptrauth_strip`).
2. For the PAC-RET fallback, implement two changes:
   1. Switch to a macro, so we can ignore the key argument.
   2. Ensure the unsigned value is erased from LR, so the possibility of gadget reuse is reduced.

Fixes #100467
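
For context, a hedged sketch (not part of the patch) of how this fallback is
exercised; it assumes an AArch64 PAC-RET build where the return address may
still carry a PAC:

  // The second argument is the key; the fallback macro ignores it, and 0 is
  // what STRIP_PAC_PC in this header passes anyway.
  void *raw = __builtin_return_address(0);
  void *pc = (void *)ptrauth_strip(raw, 0);
  // Besides yielding the stripped value, the expansion now also zeroes x30,
  // so the unsigned value does not linger in the link register (change 2).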
---
 .../lib/sanitizer_common/sanitizer_ptrauth.h  | 46 ++++++++++---------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
index 5200354694851..b5215c0d49c06 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
@@ -9,31 +9,33 @@
 #ifndef SANITIZER_PTRAUTH_H
 #define SANITIZER_PTRAUTH_H
 
-#if __has_feature(ptrauth_calls)
-#include <ptrauth.h>
+#if __has_feature(ptrauth_intrinsics)
+#  include <ptrauth.h>
 #elif defined(__ARM_FEATURE_PAC_DEFAULT) && !defined(__APPLE__)
-inline unsigned long ptrauth_strip(void* __value, unsigned int __key) {
-  // On the stack the link register is protected with Pointer
-  // Authentication Code when compiled with -mbranch-protection.
-  // Let's stripping the PAC unconditionally because xpaclri is in
-  // the NOP space so will do nothing when it is not enabled or not available.
-  unsigned long ret;
-  asm volatile(
-      "mov x30, %1\n\t"
-      "hint #7\n\t"  // xpaclri
-      "mov %0, x30\n\t"
-      : "=r"(ret)
-      : "r"(__value)
-      : "x30");
-  return ret;
-}
-#define ptrauth_auth_data(__value, __old_key, __old_data) __value
-#define ptrauth_string_discriminator(__string) ((int)0)
+// On the stack the link register is protected with Pointer
+// Authentication Code when compiled with -mbranch-protection.
+// Let's strip the PAC unconditionally because xpaclri is in the NOP space,
+// so it will do nothing when PAC is not enabled or not available.
+#  define ptrauth_strip(__value, __key) \
+    ({                                  \
+      unsigned long ret;                \
+      asm volatile(                     \
+          "mov x30, %1\n\t"             \
+          "hint #7\n\t"                 \
+          "mov %0, x30\n\t"             \
+          "mov x30, xzr\n\t"            \
+          : "=r"(ret)                   \
+          : "r"(__value)                \
+          : "x30");                     \
+      ret;                              \
+    })
+#  define ptrauth_auth_data(__value, __old_key, __old_data) __value
+#  define ptrauth_string_discriminator(__string) ((int)0)
 #else
 // Copied from <ptrauth.h>
-#define ptrauth_strip(__value, __key) __value
-#define ptrauth_auth_data(__value, __old_key, __old_data) __value
-#define ptrauth_string_discriminator(__string) ((int)0)
+#  define ptrauth_strip(__value, __key) __value
+#  define ptrauth_auth_data(__value, __old_key, __old_data) __value
+#  define ptrauth_string_discriminator(__string) ((int)0)
 #endif
 
 #define STRIP_PAC_PC(pc) ((uptr)ptrauth_strip(pc, 0))

>From 81ce796095c8e2ededa5d9df1c963cf08a75a6f1 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Thu, 25 Jul 2024 21:09:49 +0200
Subject: [PATCH 189/259] [libc][math][c23] Enable C23 _Float16 math functions
 on GPUs (#99248)

---
 .../cmake/modules/CheckCompilerFeatures.cmake | 33 +++++++--
 libc/cmake/modules/LLVMLibCFlagRules.cmake    |  6 +-
 libc/config/gpu/entrypoints.txt               | 68 +++++++++++++++++++
 .../macros/properties/cpu_features.h          |  4 ++
 libc/src/math/generic/CMakeLists.txt          | 12 ++--
 libc/src/math/generic/ceilf16.cpp             |  4 +-
 libc/src/math/generic/floorf16.cpp            |  4 +-
 libc/src/math/generic/rintf16.cpp             |  4 +-
 libc/src/math/generic/roundevenf16.cpp        |  4 +-
 libc/src/math/generic/roundf16.cpp            |  5 +-
 libc/src/math/generic/truncf16.cpp            |  4 +-
 11 files changed, 121 insertions(+), 27 deletions(-)

diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index 8b02c590f11d0..63145fe709dda 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -17,6 +17,12 @@ set(
 # Making sure ALL_COMPILER_FEATURES is sorted.
 list(SORT ALL_COMPILER_FEATURES)
 
+# Compiler features that are unavailable on GPU targets with the in-tree Clang.
+set(
+  CPU_ONLY_COMPILER_FEATURES
+    "float128"
+)
+
 # Function to check whether the compiler supports the provided set of features.
 # Usage:
 # compiler_supports(
@@ -67,13 +73,26 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
     set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE)
   endif()
 
-  try_compile(
-    has_feature
-    ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
-    SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
-    COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
-    LINK_OPTIONS ${link_options}
-  )
+  if(LIBC_TARGET_OS_IS_GPU)
+    # CUDA shouldn't be required to build the libc, only to test it, so we can't
+    # try to build CUDA binaries here. Since GPU builds are always compiled with
+    # the in-tree Clang, we just hardcode which compiler features are available
+    # when targeting GPUs.
+    if(feature IN_LIST CPU_ONLY_COMPILER_FEATURES)
+      set(has_feature FALSE)
+    else()
+      set(has_feature TRUE)
+    endif()
+  else()
+    try_compile(
+      has_feature
+      ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
+      SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
+      COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
+      LINK_OPTIONS ${link_options}
+    )
+  endif()
+
   if(has_feature)
     list(APPEND AVAILABLE_COMPILER_FEATURES ${feature})
     if(${feature} STREQUAL "float16")
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index 4398fe55db5aa..3629a7f111a7c 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -279,8 +279,10 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2")))
   set(SKIP_FLAG_EXPANSION_EXPLICIT_SIMD_OPT TRUE)
 endif()
 
-# Skip ROUND_OPT flag for targets that don't support SSE 4.2.
+# Skip ROUND_OPT flag for targets that don't support rounding instructions. On
+# x86, these are SSE4.1 instructions, but we already had code to check for
+# SSE4.2 support.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR
-       LIBC_TARGET_ARCHITECTURE_IS_AARCH64))
+       LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU))
   set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
 endif()
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index f7e89bdcd95cc..df7aa9e319624 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -348,6 +348,74 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.truncf
 )
 
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C23 _Float16 entrypoints
+    libc.src.math.canonicalizef16
+    libc.src.math.ceilf16
+    libc.src.math.copysignf16
+    libc.src.math.f16add
+    libc.src.math.f16addf
+    libc.src.math.f16div
+    libc.src.math.f16divf
+    libc.src.math.f16fma
+    libc.src.math.f16fmaf
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
+    libc.src.math.f16sqrt
+    libc.src.math.f16sqrtf
+    libc.src.math.f16sub
+    libc.src.math.f16subf
+    libc.src.math.fabsf16
+    libc.src.math.fdimf16
+    libc.src.math.floorf16
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximum_mag_numf16
+    libc.src.math.fmaximum_magf16
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fmaximumf16
+    libc.src.math.fminf16
+    libc.src.math.fminimum_mag_numf16
+    libc.src.math.fminimum_magf16
+    libc.src.math.fminimum_numf16
+    libc.src.math.fminimumf16
+    libc.src.math.fmodf16
+    libc.src.math.frexpf16
+    libc.src.math.fromfpf16
+    libc.src.math.fromfpxf16
+    libc.src.math.getpayloadf16
+    libc.src.math.ilogbf16
+    libc.src.math.ldexpf16
+    libc.src.math.llogbf16
+    libc.src.math.llrintf16
+    libc.src.math.llroundf16
+    libc.src.math.logbf16
+    libc.src.math.lrintf16
+    libc.src.math.lroundf16
+    libc.src.math.modff16
+    libc.src.math.nanf16
+    libc.src.math.nearbyintf16
+    libc.src.math.nextafterf16
+    libc.src.math.nextdownf16
+    libc.src.math.nexttowardf16
+    libc.src.math.nextupf16
+    libc.src.math.remainderf16
+    libc.src.math.remquof16
+    libc.src.math.rintf16
+    libc.src.math.roundevenf16
+    libc.src.math.roundf16
+    libc.src.math.scalblnf16
+    libc.src.math.scalbnf16
+    libc.src.math.setpayloadf16
+    libc.src.math.setpayloadsigf16
+    libc.src.math.totalorderf16
+    libc.src.math.totalordermagf16
+    libc.src.math.truncf16
+    libc.src.math.ufromfpf16
+    libc.src.math.ufromfpxf16
+  )
+endif()
+
 set(TARGET_LLVMLIBC_ENTRYPOINTS
   ${TARGET_LIBC_ENTRYPOINTS}
   ${TARGET_LIBM_ENTRYPOINTS}
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 8d431f203335f..d2cea367516db 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -53,4 +53,8 @@
 #define LIBC_TARGET_CPU_HAS_NEAREST_INT
 #endif
 
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#define LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 80e862542e29c..7280add7c8bb0 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -111,7 +111,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -548,7 +548,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -617,7 +617,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -686,7 +686,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -755,7 +755,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -948,7 +948,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp
index 708bc4cfd4860..8af31c6623a02 100644
--- a/libc/src/math/generic/ceilf16.cpp
+++ b/libc/src/math/generic/ceilf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_ceilf(x));
 #else
   return fputil::ceil(x);
diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp
index 84e4b0730ac68..3092048f5ab06 100644
--- a/libc/src/math/generic/floorf16.cpp
+++ b/libc/src/math/generic/floorf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_floorf(x));
 #else
   return fputil::floor(x);
diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp
index 0e8c091efcf9b..3a53dd28e3d10 100644
--- a/libc/src/math/generic/rintf16.cpp
+++ b/libc/src/math/generic/rintf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_rintf(x));
 #else
   return fputil::round_using_current_rounding_mode(x);
diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp
index b45670bd24ff1..c3dbd779b9739 100644
--- a/libc/src/math/generic/roundevenf16.cpp
+++ b/libc/src/math/generic/roundevenf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) &&                                   \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundevenf(x));
 #else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp
index cb668c0e76388..a5e2b44fbd54b 100644
--- a/libc/src/math/generic/roundf16.cpp
+++ b/libc/src/math/generic/roundf16.cpp
@@ -10,12 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) {
-#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(__LIBC_USE_BUILTIN_ROUND) &&                                       \
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundf(x));
 #else
   return fputil::round(x);
diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp
index b931053e53438..31b1214a9a0e4 100644
--- a/libc/src/math/generic/truncf16.cpp
+++ b/libc/src/math/generic/truncf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_truncf(x));
 #else
   return fputil::trunc(x);

>From 56fd2472d887392855ad85c53df5782a2c3f8ddb Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev at accesssoftek.com>
Date: Thu, 25 Jul 2024 22:21:03 +0300
Subject: [PATCH 190/259] [PAC] Sign LR with B key for non-leaf functions with
 ptrauth-returns attr (#100552)

The pauthtest ABI defines a number of ptrauth-* options, including
ptrauth-returns. Use the "ptrauth-returns" function attribute to indicate
that LR must be signed with the B key in non-leaf functions, instead of
reusing "sign-return-address" and "sign-return-address-key", which were
originally designed for pac-ret. A minimal sketch of the expected effect
follows below.

Co-authored-by: Ahmed Bougacha <ahmed at bougacha.org>
Co-authored-by: Anatoly Trosinenko <atrosinenko at accesssoftek.com>
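
As a rough illustration (a sketch distilled from the ptrauth-ret.ll test
added below, not an additional guarantee), a non-leaf function carrying the
"ptrauth-returns" attribute is expected to sign LR with the B key on entry
and authenticate it on return:

  ; Sketch based on the new tests in this patch.
  define void @example() nounwind "ptrauth-returns" {
    ; expected prologue:  pacibsp ; str x30, [sp, #-16]!
    call i32 @bar()
    ; expected epilogue:  ldr x30, [sp], #16 ; retab
    ret void
  }

  declare i32 @bar()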
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   7 +-
 .../AArch64/AArch64MachineFunctionInfo.cpp    |   4 +
 .../lib/Target/AArch64/AArch64PointerAuth.cpp |   3 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   9 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |   3 +-
 llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll |  98 ++++++++
 llvm/test/CodeGen/AArch64/ptrauth-ret.ll      | 225 ++++++++++++++++++
 7 files changed, 344 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll
 create mode 100644 llvm/test/CodeGen/AArch64/ptrauth-ret.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 702c2831e013f..6cd9a1a817086 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8340,7 +8340,8 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     NumBytesToCreateFrame += 8;
 
     // PAuth is enabled - set extra tail call cost, if any.
-    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
+    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
+        *RepeatedSequenceLocs[0].getMF());
     NumBytesToCheckLRInTCEpilogue =
         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
     // Checking the authenticated LR value may significantly impact
@@ -8701,6 +8702,10 @@ void AArch64InstrInfo::mergeOutliningCandidateAttributes(
   // behaviour of one of them
   const auto &CFn = Candidates.front().getMF()->getFunction();
 
+  if (CFn.hasFnAttribute("ptrauth-returns"))
+    F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
+  if (CFn.hasFnAttribute("ptrauth-auth-traps"))
+    F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
   // Since all candidates belong to the same module, just copy the
   // function-level attributes of an arbitrary function.
   if (CFn.hasFnAttribute("sign-return-address"))
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 201e8047b3686..e96c5a953ff2b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -38,6 +38,8 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
 }
 
 static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
+  if (F.hasFnAttribute("ptrauth-returns"))
+    return {true, false}; // non-leaf
   // The function should be signed in the following situations:
   // - sign-return-address=all
   // - sign-return-address=non-leaf and the functions spills the LR
@@ -56,6 +58,8 @@ static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
 }
 
 static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) {
+  if (F.hasFnAttribute("ptrauth-returns"))
+    return true;
   if (!F.hasFnAttribute("sign-return-address-key")) {
     if (STI.getTargetTriple().isOSWindows())
       return true;
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index 465e689d4a7a5..92ab4b5c3d251 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -341,7 +341,8 @@ bool AArch64PointerAuth::checkAuthenticatedLR(
   AArch64PACKey::ID KeyId =
       MFnI->shouldSignWithBKey() ? AArch64PACKey::IB : AArch64PACKey::IA;
 
-  AuthCheckMethod Method = Subtarget->getAuthenticatedLRCheckMethod();
+  AuthCheckMethod Method =
+      Subtarget->getAuthenticatedLRCheckMethod(*TI->getMF());
 
   if (Method == AuthCheckMethod::None)
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 32a355fe38f1c..642006e706c13 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -565,8 +565,13 @@ bool AArch64Subtarget::useAA() const { return UseAA; }
 // exception on its own. Later, if the callee spills the signed LR value and
 // neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
 // the higher bits of LR thus hiding the authentication failure.
-AArch64PAuth::AuthCheckMethod
-AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
+AArch64PAuth::AuthCheckMethod AArch64Subtarget::getAuthenticatedLRCheckMethod(
+    const MachineFunction &MF) const {
+  // TODO: Check subtarget for the scheme. Present variant is a default for
+  // pauthtest ABI.
+  if (MF.getFunction().hasFnAttribute("ptrauth-returns") &&
+      MF.getFunction().hasFnAttribute("ptrauth-auth-traps"))
+    return AArch64PAuth::AuthCheckMethod::HighBitsNoTBI;
   if (AuthenticatedLRCheckMethod.getNumOccurrences())
     return AuthenticatedLRCheckMethod;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index e585aad2f7a68..0f3a637f98fbe 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -413,7 +413,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }
 
   /// Choose a method of checking LR before performing a tail call.
-  AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod() const;
+  AArch64PAuth::AuthCheckMethod
+  getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
 
   /// Compute the integer discriminator for a given BlockAddress constant, if
   /// blockaddress signing is enabled, or std::nullopt otherwise.
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll b/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll
new file mode 100644
index 0000000000000..42a3050eda112
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-ret-trap.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+pauth -asm-verbose=false -disable-post-ra -o - %s | FileCheck %s
+
+; CHECK-LABEL:  test_tailcall:
+; CHECK-NEXT:   pacibsp
+; CHECK-NEXT:   str x30, [sp, #-16]!
+; CHECK-NEXT:   bl bar
+; CHECK-NEXT:   ldr x30, [sp], #16
+; CHECK-NEXT:   autibsp
+; CHECK-NEXT:   eor x16, x30, x30, lsl #1
+; CHECK-NEXT:   tbnz x16, #62, [[BAD:.L.*]]
+; CHECK-NEXT:   b bar
+; CHECK-NEXT:   [[BAD]]:
+; CHECK-NEXT:   brk #0xc471
+define i32 @test_tailcall() #0 {
+  call i32 @bar()
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+; CHECK-LABEL: test_tailcall_noframe:
+; CHECK-NEXT:  b bar
+define i32 @test_tailcall_noframe() #0 {
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+; CHECK-LABEL: test_tailcall_indirect:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         br      x0
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_tailcall_indirect(ptr %fptr) #0 {
+  call i32 @test_tailcall()
+  tail call void %fptr()
+  ret void
+}
+
+; CHECK-LABEL: test_tailcall_indirect_in_x9:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         br      x9
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_tailcall_indirect_in_x9(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in)
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         mov x16, #42
+; CHECK:         braa      x0, x16
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect(ptr %fptr) #0 {
+  call i32 @test_tailcall()
+  tail call void %fptr() [ "ptrauth"(i32 0, i64 42) ]
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect_in_x9:
+; CHECK:         autibsp
+; CHECK:         eor     x16, x30, x30, lsl #1
+; CHECK:         tbnz    x16, #62, [[BAD:.L.*]]
+; CHECK:         brabz      x9
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect_in_x9(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in) [ "ptrauth"(i32 1, i64 0) ]
+  ret void
+}
+
+; CHECK-LABEL: test_auth_tailcall_indirect_bti:
+; CHECK:         autibsp
+; CHECK:         eor     x17, x30, x30, lsl #1
+; CHECK:         tbnz    x17, #62, [[BAD:.L.*]]
+; CHECK:         brabz      x16
+; CHECK: [[BAD]]:
+; CHECK:         brk     #0xc471
+define void @test_auth_tailcall_indirect_bti(ptr sret(i64) %ret, [8 x i64] %in, ptr %fptr) #0 "branch-target-enforcement"="true" {
+  %ptr = alloca i8, i32 16
+  call i32 @test_tailcall()
+  tail call void %fptr(ptr sret(i64) %ret, [8 x i64] %in) [ "ptrauth"(i32 1, i64 0) ]
+  ret void
+}
+
+declare i32 @bar()
+
+attributes #0 = { nounwind "ptrauth-returns" "ptrauth-auth-traps" }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-ret.ll b/llvm/test/CodeGen/AArch64/ptrauth-ret.ll
new file mode 100644
index 0000000000000..61f5f6d9d23b7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-ret.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -disable-post-ra \
+; RUN:   -global-isel=0 -o - %s | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -disable-post-ra \
+; RUN:   -global-isel=1 -global-isel-abort=1 -o - %s | FileCheck %s
+
+define i32 @test() #0 {
+; CHECK-LABEL: test:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    str x19, [sp, #-16]!
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr x19, [sp], #16
+; CHECK-NEXT:    ret
+  call void asm sideeffect "", "~{x19}"()
+  ret i32 0
+}
+
+define i32 @test_alloca() #0 {
+; CHECK-LABEL: test_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 32
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_realign_alloca() #0 {
+; CHECK-LABEL: test_realign_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub x9, sp, #112
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffff80
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16
+; CHECK-NEXT:    retab
+  %p = alloca i8, i32 32, align 128
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_big_alloca() #0 {
+; CHECK-LABEL: test_big_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]!
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    ldr x29, [sp], #16
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 1024
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_var_alloca(i32 %s) #0 {
+  %p = alloca i8, i32 %s
+  call void asm sideeffect "", "r"(ptr %p)
+  ret i32 0
+}
+
+define i32 @test_noframe_saved(ptr %p) #0 {
+; CHECK-LABEL: test_noframe_saved:
+; CHECK:       %bb.0:
+
+
+; CHECK-NEXT:  str     x29, [sp, #-96]!
+; CHECK-NEXT:  stp     x28, x27, [sp, #16]
+; CHECK-NEXT:  stp     x26, x25, [sp, #32]
+; CHECK-NEXT:  stp     x24, x23, [sp, #48]
+; CHECK-NEXT:  stp     x22, x21, [sp, #64]
+; CHECK-NEXT:  stp     x20, x19, [sp, #80]
+; CHECK-NEXT:  ldr     w29, [x0]
+; CHECK-NEXT:  //APP
+; CHECK-NEXT:  //NO_APP
+; CHECK-NEXT:  mov     w0, w29
+; CHECK-NEXT:  ldp     x20, x19, [sp, #80]
+; CHECK-NEXT:  ldp     x22, x21, [sp, #64]
+; CHECK-NEXT:  ldp     x24, x23, [sp, #48]
+; CHECK-NEXT:  ldp     x26, x25, [sp, #32]
+; CHECK-NEXT:  ldp     x28, x27, [sp, #16]
+; CHECK-NEXT:  ldr     x29, [sp], #96
+; CHECK-NEXT:  ret
+  %v = load i32, ptr %p
+  call void asm sideeffect "", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"()
+  ret i32 %v
+}
+
+define void @test_noframe() #0 {
+; CHECK-LABEL: test_noframe:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    ret
+  ret void
+}
+
+; FIXME: Inefficient lowering of @llvm.returnaddress
+define ptr @test_returnaddress_0() #0 {
+; CHECK-LABEL: test_returnaddress_0:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    xpaci x30
+; CHECK-NEXT:    mov x0, x30
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  %r = call ptr @llvm.returnaddress(i32 0)
+  ret ptr %r
+}
+
+define ptr @test_returnaddress_1() #0 {
+; CHECK-LABEL: test_returnaddress_1:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    ldr x8, [x29]
+; CHECK-NEXT:    ldr x0, [x8, #8]
+; CHECK-NEXT:    xpaci x0
+; CHECK-NEXT:    ldp x29, x30, [sp], #16
+; CHECK-NEXT:    retab
+  %r = call ptr @llvm.returnaddress(i32 1)
+  ret ptr %r
+}
+
+define void @test_noframe_alloca() #0 {
+; CHECK-LABEL: test_noframe_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    add x8, sp, #12
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %p = alloca i8, i32 1
+  call void asm sideeffect "", "r"(ptr %p)
+  ret void
+}
+
+define void @test_call() #0 {
+; CHECK-LABEL: test_call:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  call i32 @bar()
+  ret void
+}
+
+define void @test_call_alloca() #0 {
+; CHECK-LABEL: test_call_alloca:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    retab
+  alloca i8
+  call i32 @bar()
+  ret void
+}
+
+define void @test_call_shrinkwrapping(i1 %c) #0 {
+; CHECK-LABEL: test_call_shrinkwrapping:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    tbz w0, #0, .LBB12_2
+; CHECK-NEXT:  %bb.1:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    autibsp
+; CHECK-NEXT:  LBB12_2:
+; CHECK-NEXT:    ret
+  br i1 %c, label %tbb, label %fbb
+tbb:
+  call i32 @bar()
+  br label %fbb
+fbb:
+  ret void
+}
+
+define i32 @test_tailcall() #0 {
+; CHECK-LABEL: test_tailcall:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    pacibsp
+; CHECK-NEXT:    str x30, [sp, #-16]!
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    ldr x30, [sp], #16
+; CHECK-NEXT:    autibsp
+; CHECK-NEXT:    b bar
+  call i32 @bar()
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+define i32 @test_tailcall_noframe() #0 {
+; CHECK-LABEL: test_tailcall_noframe:
+; CHECK:       %bb.0:
+; CHECK-NEXT:    b bar
+  %c = tail call i32 @bar()
+  ret i32 %c
+}
+
+declare i32 @bar()
+
+declare ptr @llvm.returnaddress(i32)
+
+attributes #0 = { nounwind "ptrauth-returns" }

>From ea202f9f2ec0cb19fa34c373ad1929e1aedcaf8c Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 25 Jul 2024 12:18:19 -0700
Subject: [PATCH 191/259] [LV,RISCV] Regenerate a test to reduce spurious
 deltas in upcoming change

---
 .../LoopVectorize/RISCV/short-trip-count.ll   | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index bce965c723101..b699b246a328a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -8,23 +8,23 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP7]], i32 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP10]], ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP5]], i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -67,23 +67,23 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP7]], i32 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]

>From 59e91d4c5094641ffa0763c17599b1d0d7c6fbb0 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail at justinbogner.com>
Date: Thu, 25 Jul 2024 12:27:57 -0700
Subject: [PATCH 192/259] [DXIL][Analysis] Make the DXILResource binding
 optional. NFC

This makes the binding structure in a DXILResource default to empty,
requiring a separate call to set it up, and also moves the unique ID into
it, since bindings are the only place where those IDs are actually used
(a before/after sketch follows below).

This will put us in a better position when dealing with resource
handles in libraries.

Pull Request: https://github.com/llvm/llvm-project/pull/100623
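
A rough before/after sketch of the API change, using the names from the
updated unit test (Symbol is the resource global set up there; the bind()
argument order is UniqueID, Space, LowerBound, Size):

  // Before: binding and unique ID were passed at construction time.
  ResourceInfo Old = ResourceInfo::RawBuffer(Symbol, "Buffer0",
                                             ResourceBinding{0, 0, 1},
                                             /*UniqueID=*/0);

  // After: construct without a binding, then bind separately.
  ResourceInfo New = ResourceInfo::RawBuffer(Symbol, "Buffer0");
  New.bind(/*UniqueID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1);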
---
 llvm/include/llvm/Analysis/DXILResource.h    | 88 +++++++++----------
 llvm/lib/Analysis/DXILResource.cpp           | 92 ++++++++------------
 llvm/unittests/Analysis/DXILResourceTest.cpp | 78 ++++++++---------
 3 files changed, 115 insertions(+), 143 deletions(-)

diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index cca5e0f0bd759..d4006ae10837c 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -17,19 +17,22 @@ class MDTuple;
 
 namespace dxil {
 
-struct ResourceBinding {
-  uint32_t Space;
-  uint32_t LowerBound;
-  uint32_t Size;
-
-  bool operator==(const ResourceBinding &RHS) const {
-    return std::tie(Space, LowerBound, Size) ==
-           std::tie(RHS.Space, RHS.LowerBound, RHS.Size);
-  }
-  bool operator!=(const ResourceBinding &RHS) const { return !(*this == RHS); }
-};
-
 class ResourceInfo {
+  struct ResourceBinding {
+    uint32_t UniqueID;
+    uint32_t Space;
+    uint32_t LowerBound;
+    uint32_t Size;
+
+    bool operator==(const ResourceBinding &RHS) const {
+      return std::tie(UniqueID, Space, LowerBound, Size) ==
+             std::tie(RHS.UniqueID, RHS.Space, RHS.LowerBound, RHS.Size);
+    }
+    bool operator!=(const ResourceBinding &RHS) const {
+      return !(*this == RHS);
+    }
+  };
+
   struct UAVInfo {
     bool GloballyCoherent;
     bool HasCounter;
@@ -81,12 +84,11 @@ class ResourceInfo {
   Value *Symbol;
   StringRef Name;
 
-  ResourceBinding Binding;
-  uint32_t UniqueID;
-
   dxil::ResourceClass RC;
   dxil::ResourceKind Kind;
 
+  ResourceBinding Binding = {};
+
   // Resource class dependent properties.
   // CBuffer, Sampler, and RawBuffer end here.
   union {
@@ -114,70 +116,62 @@ class ResourceInfo {
   bool isMultiSample() const;
 
   ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol,
-               StringRef Name, ResourceBinding Binding, uint32_t UniqueID)
-      : Symbol(Symbol), Name(Name), Binding(Binding), UniqueID(UniqueID),
-        RC(RC), Kind(Kind) {}
+               StringRef Name)
+      : Symbol(Symbol), Name(Name), RC(RC), Kind(Kind) {}
 
 public:
   static ResourceInfo SRV(Value *Symbol, StringRef Name,
-                          ResourceBinding Binding, uint32_t UniqueID,
                           dxil::ElementType ElementTy, uint32_t ElementCount,
                           dxil::ResourceKind Kind);
-  static ResourceInfo RawBuffer(Value *Symbol, StringRef Name,
-                                ResourceBinding Binding, uint32_t UniqueID);
+  static ResourceInfo RawBuffer(Value *Symbol, StringRef Name);
   static ResourceInfo StructuredBuffer(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, uint32_t Stride,
-                                       Align Alignment);
+                                       uint32_t Stride, Align Alignment);
   static ResourceInfo Texture2DMS(Value *Symbol, StringRef Name,
-                                  ResourceBinding Binding, uint32_t UniqueID,
                                   dxil::ElementType ElementTy,
                                   uint32_t ElementCount, uint32_t SampleCount);
-  static ResourceInfo
-  Texture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                   uint32_t UniqueID, dxil::ElementType ElementTy,
-                   uint32_t ElementCount, uint32_t SampleCount);
+  static ResourceInfo Texture2DMSArray(Value *Symbol, StringRef Name,
+                                       dxil::ElementType ElementTy,
+                                       uint32_t ElementCount,
+                                       uint32_t SampleCount);
 
   static ResourceInfo UAV(Value *Symbol, StringRef Name,
-                          ResourceBinding Binding, uint32_t UniqueID,
                           dxil::ElementType ElementTy, uint32_t ElementCount,
                           bool GloballyCoherent, bool IsROV,
                           dxil::ResourceKind Kind);
   static ResourceInfo RWRawBuffer(Value *Symbol, StringRef Name,
-                                  ResourceBinding Binding, uint32_t UniqueID,
                                   bool GloballyCoherent, bool IsROV);
   static ResourceInfo RWStructuredBuffer(Value *Symbol, StringRef Name,
-                                         ResourceBinding Binding,
-                                         uint32_t UniqueID, uint32_t Stride,
+                                         uint32_t Stride,
                                          Align Alignment, bool GloballyCoherent,
                                          bool IsROV, bool HasCounter);
   static ResourceInfo RWTexture2DMS(Value *Symbol, StringRef Name,
-                                    ResourceBinding Binding, uint32_t UniqueID,
                                     dxil::ElementType ElementTy,
                                     uint32_t ElementCount, uint32_t SampleCount,
                                     bool GloballyCoherent);
-  static ResourceInfo
-  RWTexture2DMSArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                     uint32_t UniqueID, dxil::ElementType ElementTy,
-                     uint32_t ElementCount, uint32_t SampleCount,
-                     bool GloballyCoherent);
+  static ResourceInfo RWTexture2DMSArray(Value *Symbol, StringRef Name,
+                                         dxil::ElementType ElementTy,
+                                         uint32_t ElementCount,
+                                         uint32_t SampleCount,
+                                         bool GloballyCoherent);
   static ResourceInfo FeedbackTexture2D(Value *Symbol, StringRef Name,
-                                        ResourceBinding Binding,
-                                        uint32_t UniqueID,
                                         dxil::SamplerFeedbackType FeedbackTy);
   static ResourceInfo
-  FeedbackTexture2DArray(Value *Symbol, StringRef Name, ResourceBinding Binding,
-                         uint32_t UniqueID,
+  FeedbackTexture2DArray(Value *Symbol, StringRef Name,
                          dxil::SamplerFeedbackType FeedbackTy);
 
-  static ResourceInfo CBuffer(Value *Symbol, StringRef Name,
-                              ResourceBinding Binding, uint32_t UniqueID,
-                              uint32_t Size);
+  static ResourceInfo CBuffer(Value *Symbol, StringRef Name, uint32_t Size);
 
   static ResourceInfo Sampler(Value *Symbol, StringRef Name,
-                              ResourceBinding Binding, uint32_t UniqueID,
                               dxil::SamplerType SamplerTy);
 
+  void bind(uint32_t UniqueID, uint32_t Space, uint32_t LowerBound,
+            uint32_t Size) {
+    Binding.UniqueID = UniqueID;
+    Binding.Space = Space;
+    Binding.LowerBound = LowerBound;
+    Binding.Size = Size;
+  }
+
   bool operator==(const ResourceInfo &RHS) const;
 
   MDTuple *getAsMetadata(LLVMContext &Ctx) const;
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index d47a73c05a3e5..72cba9d4373bb 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -64,10 +64,9 @@ bool ResourceInfo::isMultiSample() const {
 }
 
 ResourceInfo ResourceInfo::SRV(Value *Symbol, StringRef Name,
-                               ResourceBinding Binding, uint32_t UniqueID,
                                ElementType ElementTy, uint32_t ElementCount,
                                ResourceKind Kind) {
-  ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name, Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name);
   assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
          "Invalid ResourceKind for SRV constructor.");
   RI.Typed.ElementTy = ElementTy;
@@ -75,43 +74,37 @@ ResourceInfo ResourceInfo::SRV(Value *Symbol, StringRef Name,
   return RI;
 }
 
-ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name,
-                                     ResourceBinding Binding,
-                                     uint32_t UniqueID) {
-  ResourceInfo RI(ResourceClass::SRV, ResourceKind::RawBuffer, Symbol, Name,
-                  Binding, UniqueID);
+ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name) {
+  ResourceInfo RI(ResourceClass::SRV, ResourceKind::RawBuffer, Symbol, Name);
   return RI;
 }
 
 ResourceInfo ResourceInfo::StructuredBuffer(Value *Symbol, StringRef Name,
-                                            ResourceBinding Binding,
-                                            uint32_t UniqueID, uint32_t Stride,
-                                            Align Alignment) {
+                                            uint32_t Stride, Align Alignment) {
   ResourceInfo RI(ResourceClass::SRV, ResourceKind::StructuredBuffer, Symbol,
-                  Name, Binding, UniqueID);
+                  Name);
   RI.Struct.Stride = Stride;
   RI.Struct.Alignment = Alignment;
   return RI;
 }
 
 ResourceInfo ResourceInfo::Texture2DMS(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, ElementType ElementTy,
+                                       ElementType ElementTy,
                                        uint32_t ElementCount,
                                        uint32_t SampleCount) {
-  ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name,
-                  Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name);
   RI.Typed.ElementTy = ElementTy;
   RI.Typed.ElementCount = ElementCount;
   RI.MultiSample.Count = SampleCount;
   return RI;
 }
 
-ResourceInfo ResourceInfo::Texture2DMSArray(
-    Value *Symbol, StringRef Name, ResourceBinding Binding, uint32_t UniqueID,
-    ElementType ElementTy, uint32_t ElementCount, uint32_t SampleCount) {
+ResourceInfo ResourceInfo::Texture2DMSArray(Value *Symbol, StringRef Name,
+                                            ElementType ElementTy,
+                                            uint32_t ElementCount,
+                                            uint32_t SampleCount) {
   ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMSArray, Symbol,
-                  Name, Binding, UniqueID);
+                  Name);
   RI.Typed.ElementTy = ElementTy;
   RI.Typed.ElementCount = ElementCount;
   RI.MultiSample.Count = SampleCount;
@@ -119,11 +112,10 @@ ResourceInfo ResourceInfo::Texture2DMSArray(
 }
 
 ResourceInfo ResourceInfo::UAV(Value *Symbol, StringRef Name,
-                               ResourceBinding Binding, uint32_t UniqueID,
                                ElementType ElementTy, uint32_t ElementCount,
                                bool GloballyCoherent, bool IsROV,
                                ResourceKind Kind) {
-  ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name, Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name);
   assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
          "Invalid ResourceKind for UAV constructor.");
   RI.Typed.ElementTy = ElementTy;
@@ -135,11 +127,8 @@ ResourceInfo ResourceInfo::UAV(Value *Symbol, StringRef Name,
 }
 
 ResourceInfo ResourceInfo::RWRawBuffer(Value *Symbol, StringRef Name,
-                                       ResourceBinding Binding,
-                                       uint32_t UniqueID, bool GloballyCoherent,
-                                       bool IsROV) {
-  ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name,
-                  Binding, UniqueID);
+                                       bool GloballyCoherent, bool IsROV) {
+  ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name);
   RI.UAVFlags.GloballyCoherent = GloballyCoherent;
   RI.UAVFlags.IsROV = IsROV;
   RI.UAVFlags.HasCounter = false;
@@ -147,13 +136,11 @@ ResourceInfo ResourceInfo::RWRawBuffer(Value *Symbol, StringRef Name,
 }
 
 ResourceInfo ResourceInfo::RWStructuredBuffer(Value *Symbol, StringRef Name,
-                                              ResourceBinding Binding,
-                                              uint32_t UniqueID,
                                               uint32_t Stride, Align Alignment,
                                               bool GloballyCoherent, bool IsROV,
                                               bool HasCounter) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::StructuredBuffer, Symbol,
-                  Name, Binding, UniqueID);
+                  Name);
   RI.Struct.Stride = Stride;
   RI.Struct.Alignment = Alignment;
   RI.UAVFlags.GloballyCoherent = GloballyCoherent;
@@ -162,13 +149,12 @@ ResourceInfo ResourceInfo::RWStructuredBuffer(Value *Symbol, StringRef Name,
   return RI;
 }
 
-ResourceInfo
-ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
-                            ResourceBinding Binding, uint32_t UniqueID,
-                            ElementType ElementTy, uint32_t ElementCount,
-                            uint32_t SampleCount, bool GloballyCoherent) {
-  ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name,
-                  Binding, UniqueID);
+ResourceInfo ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
+                                         ElementType ElementTy,
+                                         uint32_t ElementCount,
+                                         uint32_t SampleCount,
+                                         bool GloballyCoherent) {
+  ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name);
   RI.Typed.ElementTy = ElementTy;
   RI.Typed.ElementCount = ElementCount;
   RI.UAVFlags.GloballyCoherent = GloballyCoherent;
@@ -178,13 +164,13 @@ ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
   return RI;
 }
 
-ResourceInfo
-ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
-                                 ResourceBinding Binding, uint32_t UniqueID,
-                                 ElementType ElementTy, uint32_t ElementCount,
-                                 uint32_t SampleCount, bool GloballyCoherent) {
+ResourceInfo ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
+                                              ElementType ElementTy,
+                                              uint32_t ElementCount,
+                                              uint32_t SampleCount,
+                                              bool GloballyCoherent) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMSArray, Symbol,
-                  Name, Binding, UniqueID);
+                  Name);
   RI.Typed.ElementTy = ElementTy;
   RI.Typed.ElementCount = ElementCount;
   RI.UAVFlags.GloballyCoherent = GloballyCoherent;
@@ -195,11 +181,9 @@ ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
 }
 
 ResourceInfo ResourceInfo::FeedbackTexture2D(Value *Symbol, StringRef Name,
-                                             ResourceBinding Binding,
-                                             uint32_t UniqueID,
                                              SamplerFeedbackType FeedbackTy) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2D, Symbol,
-                  Name, Binding, UniqueID);
+                  Name);
   RI.UAVFlags.GloballyCoherent = false;
   RI.UAVFlags.IsROV = false;
   RI.UAVFlags.HasCounter = false;
@@ -209,10 +193,9 @@ ResourceInfo ResourceInfo::FeedbackTexture2D(Value *Symbol, StringRef Name,
 
 ResourceInfo
 ResourceInfo::FeedbackTexture2DArray(Value *Symbol, StringRef Name,
-                                     ResourceBinding Binding, uint32_t UniqueID,
                                      SamplerFeedbackType FeedbackTy) {
   ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2DArray,
-                  Symbol, Name, Binding, UniqueID);
+                  Symbol, Name);
   RI.UAVFlags.GloballyCoherent = false;
   RI.UAVFlags.IsROV = false;
   RI.UAVFlags.HasCounter = false;
@@ -221,27 +204,22 @@ ResourceInfo::FeedbackTexture2DArray(Value *Symbol, StringRef Name,
 }
 
 ResourceInfo ResourceInfo::CBuffer(Value *Symbol, StringRef Name,
-                                   ResourceBinding Binding, uint32_t UniqueID,
                                    uint32_t Size) {
-  ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name,
-                  Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name);
   RI.CBufferSize = Size;
   return RI;
 }
 
 ResourceInfo ResourceInfo::Sampler(Value *Symbol, StringRef Name,
-                                   ResourceBinding Binding, uint32_t UniqueID,
                                    SamplerType SamplerTy) {
-  ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name,
-                  Binding, UniqueID);
+  ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name);
   RI.SamplerTy = SamplerTy;
   return RI;
 }
 
 bool ResourceInfo::operator==(const ResourceInfo &RHS) const {
-  if (std::tie(Symbol, Name, Binding, UniqueID, RC, Kind) !=
-      std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.UniqueID, RHS.RC,
-               RHS.Kind))
+  if (std::tie(Symbol, Name, Binding, RC, Kind) !=
+      std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.RC, RHS.Kind))
     return false;
   if (isCBuffer())
     return CBufferSize == RHS.CBufferSize;
@@ -278,7 +256,7 @@ MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const {
         Constant::getIntegerValue(I1Ty, APInt(1, V)));
   };
 
-  MDVals.push_back(getIntMD(UniqueID));
+  MDVals.push_back(getIntMD(Binding.UniqueID));
   MDVals.push_back(ValueAsMetadata::get(Symbol));
   MDVals.push_back(MDString::get(Ctx, Name));
   MDVals.push_back(getIntMD(Binding.Space));
diff --git a/llvm/unittests/Analysis/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
index 036ba40f275d0..554cbd0d8ded7 100644
--- a/llvm/unittests/Analysis/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -113,9 +113,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // ByteAddressBuffer Buffer0;
   Value *Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.ByteAddressBuffer"));
-  ResourceInfo Resource =
-      ResourceInfo::RawBuffer(Symbol, "Buffer0", ResourceBinding{0, 0, 1},
-                              /*UniqueID=*/0);
+  ResourceInfo Resource = ResourceInfo::RawBuffer(Symbol, "Buffer0");
+  Resource.bind(0, 0, 0, 1);
   std::pair<uint32_t, uint32_t> Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000bU);
   EXPECT_EQ(Props.second, 0U);
@@ -125,9 +124,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // RWByteAddressBuffer BufferOut : register(u3, space2);
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.RWByteAddressBuffer"));
-  Resource = ResourceInfo::RWRawBuffer(
-      Symbol, "BufferOut", ResourceBinding{2, 3, 1}, /*UniqueID=*/1,
-      /*GloballyCoherent=*/false, /*IsROV=*/false);
+  Resource =
+      ResourceInfo::RWRawBuffer(Symbol, "BufferOut",
+                                /*GloballyCoherent=*/false, /*IsROV=*/false);
+  Resource.bind(1, 2, 3, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000100bU);
   EXPECT_EQ(Props.second, 0U);
@@ -141,9 +141,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
       StructType::create(Context, {Int32Ty, FloatTy, DoubleTy}, "BufType0");
   Symbol = UndefValue::get(StructType::create(
       Context, {BufType0}, "class.StructuredBuffer<BufType>"));
-  Resource = ResourceInfo::StructuredBuffer(
-      Symbol, "Buffer0", ResourceBinding{0, 0, 1}, /*UniqueID=*/0,
-      /*Stride=*/16, Align(8));
+  Resource = ResourceInfo::StructuredBuffer(Symbol, "Buffer0",
+                                            /*Stride=*/16, Align(8));
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000030cU);
   EXPECT_EQ(Props.second, 0x00000010U);
@@ -155,9 +155,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(StructType::create(
       Context, {Floatx4Ty}, "class.Texture2D<vector<float, 4> >"));
   Resource =
-      ResourceInfo::SRV(Symbol, "ColorMapTexture", ResourceBinding{0, 2, 1},
-                        /*UniqueID=*/2, dxil::ElementType::F32,
+      ResourceInfo::SRV(Symbol, "ColorMapTexture", dxil::ElementType::F32,
                         /*ElementCount=*/4, dxil::ResourceKind::Texture2D);
+  Resource.bind(2, 0, 2, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00000002U);
   EXPECT_EQ(Props.second, 0x00000409U);
@@ -169,9 +169,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(
       StructType::create(Context, {FloatTy}, "class.Texture2DMS<float, 8>"));
   Resource =
-      ResourceInfo::Texture2DMS(Symbol, "DepthBuffer", ResourceBinding{0, 0, 1},
-                                /*UniqueID=*/0, dxil::ElementType::F32,
+      ResourceInfo::Texture2DMS(Symbol, "DepthBuffer", dxil::ElementType::F32,
                                 /*ElementCount=*/1, /*SampleCount=*/8);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00000003U);
   EXPECT_EQ(Props.second, 0x00080109U);
@@ -182,9 +182,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // FeedbackTexture2D<SAMPLER_FEEDBACK_MIN_MIP> feedbackMinMip;
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "class.FeedbackTexture2D<0>"));
-  Resource = ResourceInfo::FeedbackTexture2D(
-      Symbol, "feedbackMinMip", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, SamplerFeedbackType::MinMip);
+  Resource = ResourceInfo::FeedbackTexture2D(Symbol, "feedbackMinMip",
+                                             SamplerFeedbackType::MinMip);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001011U);
   EXPECT_EQ(Props.second, 0U);
@@ -196,8 +196,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32Ty}, "class.FeedbackTexture2DArray<1>"));
   Resource = ResourceInfo::FeedbackTexture2DArray(
-      Symbol, "feedbackMipRegion", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, SamplerFeedbackType::MipRegionUsed);
+      Symbol, "feedbackMipRegion", SamplerFeedbackType::MipRegionUsed);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001012U);
   EXPECT_EQ(Props.second, 0x00000001U);
@@ -208,11 +208,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // globallycoherent RWTexture2D<int2> OutputTexture : register(u0, space2);
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32x2Ty}, "class.RWTexture2D<vector<int, 2> >"));
-  Resource =
-      ResourceInfo::UAV(Symbol, "OutputTexture", ResourceBinding{2, 0, 1},
-                        /*UniqueID=*/0, dxil::ElementType::I32,
-                        /*ElementCount=*/2, /*GloballyCoherent=*/1, /*IsROV=*/0,
-                        dxil::ResourceKind::Texture2D);
+  Resource = ResourceInfo::UAV(Symbol, "OutputTexture", dxil::ElementType::I32,
+                               /*ElementCount=*/2, /*GloballyCoherent=*/1,
+                               /*IsROV=*/0, dxil::ResourceKind::Texture2D);
+  Resource.bind(0, 2, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00005002U);
   EXPECT_EQ(Props.second, 0x00000204U);
@@ -224,10 +223,10 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(
       StructType::create(Context, {Floatx4Ty},
                          "class.RasterizerOrderedBuffer<vector<float, 4> >"));
-  Resource = ResourceInfo::UAV(Symbol, "ROB", ResourceBinding{0, 0, 1},
-                               /*UniqueID=*/0, dxil::ElementType::F32,
+  Resource = ResourceInfo::UAV(Symbol, "ROB", dxil::ElementType::F32,
                                /*ElementCount=*/4, /*GloballyCoherent=*/0,
                                /*IsROV=*/1, dxil::ResourceKind::TypedBuffer);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000300aU);
   EXPECT_EQ(Props.second, 0x00000409U);
@@ -240,10 +239,11 @@ TEST(DXILResource, AnnotationsAndMetadata) {
       Context, {Floatx3Ty, FloatTy, Int32Ty}, "ParticleMotion");
   Symbol = UndefValue::get(StructType::create(
       Context, {BufType1}, "class.StructuredBuffer<ParticleMotion>"));
-  Resource = ResourceInfo::RWStructuredBuffer(
-      Symbol, "g_OutputBuffer", ResourceBinding{0, 2, 1},
-      /*UniqueID=*/0, /*Stride=*/20, Align(4), /*GloballyCoherent=*/false,
-      /*IsROV=*/false, /*HasCounter=*/true);
+  Resource =
+      ResourceInfo::RWStructuredBuffer(Symbol, "g_OutputBuffer", /*Stride=*/20,
+                                       Align(4), /*GloballyCoherent=*/false,
+                                       /*IsROV=*/false, /*HasCounter=*/true);
+  Resource.bind(0, 0, 2, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000920cU);
   EXPECT_EQ(Props.second, 0x00000014U);
@@ -255,9 +255,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   Symbol = UndefValue::get(StructType::create(
       Context, {Int32Ty}, "class.RWTexture2DMSArray<unsigned int, 8>"));
   Resource = ResourceInfo::RWTexture2DMSArray(
-      Symbol, "g_rw_t2dmsa", ResourceBinding{0, 0, 1},
-      /*UniqueID=*/0, dxil::ElementType::U32, /*ElementCount=*/1,
+      Symbol, "g_rw_t2dmsa", dxil::ElementType::U32, /*ElementCount=*/1,
       /*SampleCount=*/8, /*GloballyCoherent=*/false);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x00001008U);
   EXPECT_EQ(Props.second, 0x00080105U);
@@ -268,8 +268,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // cbuffer cb0 { float4 g_X; float4 g_Y; }
   Symbol = UndefValue::get(
       StructType::create(Context, {Floatx4Ty, Floatx4Ty}, "cb0"));
-  Resource = ResourceInfo::CBuffer(Symbol, "cb0", ResourceBinding{0, 0, 1},
-                                   /*UniqueID=*/0, /*Size=*/32);
+  Resource = ResourceInfo::CBuffer(Symbol, "cb0", /*Size=*/32);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000dU);
   EXPECT_EQ(Props.second, 0x00000020U);
@@ -279,9 +279,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
   // SamplerState ColorMapSampler : register(s0);
   Symbol = UndefValue::get(
       StructType::create(Context, {Int32Ty}, "struct.SamplerState"));
-  Resource =
-      ResourceInfo::Sampler(Symbol, "ColorMapSampler", ResourceBinding{0, 0, 1},
-                            /*UniqueID=*/0, dxil::SamplerType::Default);
+  Resource = ResourceInfo::Sampler(Symbol, "ColorMapSampler",
+                                   dxil::SamplerType::Default);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000000eU);
   EXPECT_EQ(Props.second, 0U);
@@ -290,9 +290,9 @@ TEST(DXILResource, AnnotationsAndMetadata) {
               TestMD.get(0, Symbol, "ColorMapSampler", 0, 0, 1, 0, nullptr));
 
   // SamplerComparisonState ShadowSampler {...};
-  Resource =
-      ResourceInfo::Sampler(Symbol, "CmpSampler", ResourceBinding{0, 0, 1},
-                            /*UniqueID=*/0, dxil::SamplerType::Comparison);
+  Resource = ResourceInfo::Sampler(Symbol, "CmpSampler",
+                                   dxil::SamplerType::Comparison);
+  Resource.bind(0, 0, 0, 1);
   Props = Resource.getAnnotateProps();
   EXPECT_EQ(Props.first, 0x0000800eU);
   EXPECT_EQ(Props.second, 0U);

>From 8470a23c488ffa0fe3a0ed6fe776b9bb6336897b Mon Sep 17 00:00:00 2001
From: nicebert <110385235+nicebert at users.noreply.github.com>
Date: Thu, 25 Jul 2024 21:30:14 +0200
Subject: [PATCH 193/259] [OpenMP] Defaultmap: fixes scalar issue, adds all
 variable category (#99315)

Fixes an issue with defaultmap where the scalar category isn't handled
correctly for the present modifier. Adds the 'all' variable category
introduced in OpenMP 5.2 and updates the existing error-message tests to
check the OpenMP 5.2 defaultmap diagnostics.
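
For context, a minimal usage sketch (not part of the patch) of the OpenMP 5.2
behavior exercised here; the compiler flags and the expected output are
assumptions:

    // Hypothetical example; assumed to be compiled with
    //   clang -fopenmp -fopenmp-version=52 example.c
    #include <stdio.h>

    int main(void) {
      int s = 1;               // scalar
      int a[4] = {0, 0, 0, 0}; // aggregate

      // New in OpenMP 5.2: the 'all' variable category applies one
      // implicit-map rule to scalars, aggregates, and pointers alike.
      #pragma omp target defaultmap(tofrom: all)
      {
        s += 1;
        a[0] = s;
      }

      // The 'present' modifier now also composes with the 'scalar' category:
      // the scalar must already be in the device data environment (roughly
      // map(present, alloc: s)) when the region starts.
      #pragma omp target data map(tofrom: s)
      {
        #pragma omp target defaultmap(present: scalar)
        { s += 1; }
      }

      printf("%d %d\n", s, a[0]); // expected: 3 2
      return 0;
    }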
---
 clang/include/clang/Basic/OpenMPKinds.def     |  1 +
 clang/lib/Parse/ParseOpenMP.cpp               |  1 +
 clang/lib/Sema/SemaOpenMP.cpp                 | 25 +++++++++--------
 .../OpenMP/target_defaultmap_messages.cpp     | 27 ++++++++++---------
 .../target_parallel_defaultmap_messages.cpp   | 23 +++++++++-------
 ...arget_parallel_for_defaultmap_messages.cpp | 27 ++++++++++---------
 ..._parallel_for_simd_defaultmap_messages.cpp | 27 ++++++++++---------
 .../target_simd_defaultmap_messages.cpp       | 27 ++++++++++---------
 .../target_teams_defaultmap_messages.cpp      | 27 ++++++++++---------
 ...t_teams_distribute_defaultmap_messages.cpp | 27 ++++++++++---------
 ...ibute_parallel_for_defaultmap_messages.cpp | 27 ++++++++++---------
 ..._parallel_for_simd_defaultmap_messages.cpp | 27 ++++++++++---------
 12 files changed, 149 insertions(+), 117 deletions(-)

diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index f46a92d5ecfd4..51084913bf102 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -107,6 +107,7 @@ OPENMP_DEVICE_MODIFIER(device_num)
 OPENMP_DEFAULTMAP_KIND(scalar)
 OPENMP_DEFAULTMAP_KIND(aggregate)
 OPENMP_DEFAULTMAP_KIND(pointer)
+OPENMP_DEFAULTMAP_KIND(all)
 
 // Modifiers for 'defaultmap' clause.
 OPENMP_DEFAULTMAP_MODIFIER(alloc)
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index f5b44d210680c..e975e96c5c7e4 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3837,6 +3837,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
     // Get a defaultmap modifier
     unsigned Modifier = getOpenMPSimpleClauseType(
         Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+
     // Set defaultmap modifier to unknown if it is either scalar, aggregate, or
     // pointer
     if (Modifier < OMPC_DEFAULTMAP_MODIFIER_unknown)
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index bb18c558c49a6..9c80b3eec914c 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -801,7 +801,8 @@ class DSAStackTy {
       return (M == OMPC_DEFAULTMAP_MODIFIER_alloc) ||
              (M == OMPC_DEFAULTMAP_MODIFIER_to) ||
              (M == OMPC_DEFAULTMAP_MODIFIER_from) ||
-             (M == OMPC_DEFAULTMAP_MODIFIER_tofrom);
+             (M == OMPC_DEFAULTMAP_MODIFIER_tofrom) ||
+             (M == OMPC_DEFAULTMAP_MODIFIER_present);
     }
     return true;
   }
@@ -3078,11 +3079,11 @@ ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope,
     if (TypoCorrection Corrected =
             SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr,
                                 CCC, Sema::CTK_ErrorRecovery)) {
-      SemaRef.diagnoseTypo(Corrected,
-                           PDiag(Lookup.empty()
-                                     ? diag::err_undeclared_var_use_suggest
-                                     : diag::err_omp_expected_var_arg_suggest)
-                               << Id.getName());
+      SemaRef.diagnoseTypo(
+          Corrected,
+          SemaRef.PDiag(Lookup.empty() ? diag::err_undeclared_var_use_suggest
+                                       : diag::err_omp_expected_var_arg_suggest)
+              << Id.getName());
       VD = Corrected.getCorrectionDeclAs<VarDecl>();
     } else {
       Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use
@@ -7500,9 +7501,9 @@ SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG,
                               PartialDiagnostic::NullDiagnostic()),
           PartialDiagnosticAt(
               VariantRef->getExprLoc(),
-              PDiag(diag::err_omp_declare_variant_doesnt_support)),
+              SemaRef.PDiag(diag::err_omp_declare_variant_doesnt_support)),
           PartialDiagnosticAt(VariantRef->getExprLoc(),
-                              PDiag(diag::err_omp_declare_variant_diff)
+                              SemaRef.PDiag(diag::err_omp_declare_variant_diff)
                                   << FD->getLocation()),
           /*TemplatesSupported=*/true, /*ConstexprSupported=*/false,
           /*CLinkageMayDiffer=*/true))
@@ -21892,7 +21893,9 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause(
     bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) ||
                             (getLangOpts().OpenMP >= 50 && KindLoc.isInvalid());
     if (!isDefaultmapKind || !isDefaultmapModifier) {
-      StringRef KindValue = "'scalar', 'aggregate', 'pointer'";
+      StringRef KindValue = getLangOpts().OpenMP < 52
+                                ? "'scalar', 'aggregate', 'pointer'"
+                                : "'scalar', 'aggregate', 'pointer', 'all'";
       if (getLangOpts().OpenMP == 50) {
         StringRef ModifierValue = "'alloc', 'from', 'to', 'tofrom', "
                                   "'firstprivate', 'none', 'default'";
@@ -21936,7 +21939,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause(
       return nullptr;
     }
   }
-  if (Kind == OMPC_DEFAULTMAP_unknown) {
+  if (Kind == OMPC_DEFAULTMAP_unknown || Kind == OMPC_DEFAULTMAP_all) {
     // Variable category is not specified - mark all categories.
     DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_aggregate, StartLoc);
     DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_scalar, StartLoc);
@@ -22009,7 +22012,7 @@ NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName(
             SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr,
                                 CCC, Sema::CTK_ErrorRecovery)) {
       SemaRef.diagnoseTypo(Corrected,
-                           PDiag(diag::err_undeclared_var_use_suggest)
+                           SemaRef.PDiag(diag::err_undeclared_var_use_suggest)
                                << Id.getName());
       checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl());
       return nullptr;
diff --git a/clang/test/OpenMP/target_defaultmap_messages.cpp b/clang/test/OpenMP/target_defaultmap_messages.cpp
index 8052e34d5c933..88ae3b7962d55 100644
--- a/clang/test/OpenMP/target_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -verify=expected,omp51 -Wuninitialized -DOMP51
-// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -30,23 +33,23 @@ template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
   #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} 
   foo();
 #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap (scalar: // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target defaultmap (scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp51-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
+  #pragma omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
 
   foo();
 
@@ -93,23 +96,23 @@ T tmain(T argc, S **argv) {
 int main(int argc, char **argv) {
 #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(scalar: // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target defaultmap(scalar: // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp51-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
+#pragma omp target defaultmap(tofrom: scalar) defaultmap(tofrom: scalar) // omp45-error {{directive '#pragma omp target' cannot contain more than one 'defaultmap' clause}} omp5-error {{at most one defaultmap clause for each variable-category can appear on the directive}} omp5x-error {{at most one defaultmap clause for each variable-category can appear on the directive}}
   foo();
 
 #ifdef OMP5
diff --git a/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
index 4c226e9ca2aef..3c628c4f3e172 100644
--- a/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized -Wno-vla -DOMP51
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized -Wno-vla -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
+// RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized -DOMP51
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -Wno-vla -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -Wno-vla -DOMP5
@@ -28,19 +31,19 @@ void foo() {
 
 template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
-#pragma omp target parallel defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target parallel defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
@@ -90,19 +93,19 @@ T tmain(T argc, S **argv) {
 }
 
 int main(int argc, char **argv) {
-#pragma omp target parallel defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target parallel defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target parallel defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
diff --git a/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
index 8b58bb9111c81..b142ae1733a3c 100644
--- a/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp51-error {{expected ')'}} omp5-note {{to match this '('}} omp51-note {{to match this '('}}
+#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5x-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
   #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp51-error {{expected ')'}} omp5-note {{to match this '('}} omp51-note {{to match this '('}}
+#pragma omp target parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5x-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for defaultmap (scalar: // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for defaultmap (scalar: // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
index 7d72547d24e1e..46f78290cba4b 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
   #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
index 0fa86cc7279ad..2169681e05749 100644
--- a/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,24 +18,24 @@ T tmain(T argc, S **argv) {
   int i;
   #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+  #pragma omp target simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
@@ -46,28 +49,28 @@ int main(int argc, char **argv) {
 #pragma omp target simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target simd defaultmap(scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}}
+#pragma omp target simd defaultmap(scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} 
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
diff --git a/clang/test/OpenMP/target_teams_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
index 206a1e4f18af2..97ed5315c4c8e 100644
--- a/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -14,21 +17,21 @@ template <class T, typename S, int N, int ST>
 T tmain(T argc, S **argv) {
   #pragma omp target teams defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   foo();
 #pragma omp target teams defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
@@ -39,21 +42,21 @@ T tmain(T argc, S **argv) {
 int main(int argc, char **argv) {
   #pragma omp target teams defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   foo();
 #pragma omp target teams defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
-  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  #pragma omp target teams defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   foo();
 #pragma omp target teams defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
index 84d5b198c2878..3e3392d94af98 100644
--- a/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
 #pragma omp target teams distribute defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
index 0bb519be21740..b789c39dfad6b 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,21 +18,21 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
@@ -41,21 +44,21 @@ int main(int argc, char **argv) {
   int i;
 #pragma omp target teams distribute parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap( // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap( // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap() // omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap() // omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
index 0eeca55a9dcd2..ab3ea1b283d3b 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_defaultmap_messages.cpp
@@ -1,5 +1,8 @@
-// RUN: %clang_cc1 -verify -fopenmp %s -verify=expected,omp51 -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -verify=expected,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=52 -verify=expected,omp5x,omp52 -Wuninitialized
+
+// RUN: %clang_cc1 -verify -fopenmp %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -fopenmp-version=51 -verify=expected,omp5x,omp51 -Wuninitialized
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 %s -verify=expected,omp5 -Wuninitialized -DOMP5
@@ -15,22 +18,22 @@ T tmain(T argc, S **argv) {
   int i;
 #pragma omp target teams distribute parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
@@ -44,27 +47,27 @@ int main(int argc, char **argv) {
 #pragma omp target teams distribute parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap( // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap( // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap() // omp51-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap() // omp5x-error {{'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
-#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp51-error {{expected ')'}} omp51-note {{to match this '('}}
+#pragma omp target teams distribute parallel for simd defaultmap(tofrom scalar) // omp45-warning {{missing ':' after defaultmap modifier - ignoring}} omp5-error {{expected ')'}} omp5-note {{to match this '('}} omp5x-error {{expected ')'}} omp5x-note {{to match this '('}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i) foo();
-#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp51-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+#pragma omp target teams distribute parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} omp5x-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default', 'present' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'alloc', 'from', 'to', 'tofrom', 'firstprivate', 'none', 'default' in OpenMP clause 'defaultmap'}} omp52-error {{expected 'scalar', 'aggregate', 'pointer', 'all' in OpenMP clause 'defaultmap'}} omp51-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} omp5-error {{expected 'scalar', 'aggregate', 'pointer' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}} omp45-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
   for (i = 0; i < argc; ++i)
     foo();
 #pragma omp target teams distribute parallel for simd defaultmap(tofrom, scalar // expected-error {{expected ')'}} omp45-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}} omp45-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}

>From a737b8704c031310460d492cef90eee5054cabd7 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 25 Jul 2024 15:52:49 -0400
Subject: [PATCH 194/259] [ctx_prof] test tool: generate ctxprof bitstream from
 json (#100379)

This is a tool to simplify testing. It generates a valid contextual profile file from a JSON representation.

The tool is written to allow for future evolution, e.g. supporting profile merging or other tasks that are not necessarily scoped to testing.
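
For reference, a minimal input in the format the tool accepts might look like the sketch below. This is only an illustration pieced together from the `--input` option description and the `valid.json` test input added in this patch; the `Guid` and counter values are invented. Here callsite index 0 has no callees and callsite index 1 has a single callee.

  [
    {
      "Guid": 1234,
      "Counters": [1, 2],
      "Callsites": [
        [],
        [
          { "Guid": 5678, "Counters": [3] }
        ]
      ]
    }
  ]

Converting such a file with `llvm-ctxprof-util fromJSON --input example.json --output example.bitstream` (hypothetical file names) and dumping the result with `llvm-bcanalyzer --dump` mirrors the RUN lines in the tests below.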
---
 llvm/test/CMakeLists.txt                      |   1 +
 llvm/test/lit.cfg.py                          |   1 +
 .../tools/llvm-ctxprof-util/Inputs/bad.json   |   1 +
 .../tools/llvm-ctxprof-util/Inputs/empty.json |   1 +
 .../Inputs/invalid-bad-subctx.json            |   8 +
 .../Inputs/invalid-no-counters.json           |   5 +
 .../Inputs/invalid-no-ctx.json                |   1 +
 .../Inputs/invalid-no-vector.json             |   1 +
 .../tools/llvm-ctxprof-util/Inputs/valid.json |  47 ++++++
 .../llvm-ctxprof-util-negative.test           |  22 +++
 .../llvm-ctxprof-util/llvm-ctxprof-util.test  |  46 ++++++
 llvm/tools/CMakeLists.txt                     |   1 +
 llvm/tools/llvm-ctxprof-util/CMakeLists.txt   |  14 ++
 .../llvm-ctxprof-util/llvm-ctxprof-util.cpp   | 150 ++++++++++++++++++
 14 files changed, 299 insertions(+)
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
 create mode 100644 llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
 create mode 100644 llvm/tools/llvm-ctxprof-util/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 6b7f2b58e603e..8abc153336251 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -76,6 +76,7 @@ set(LLVM_TEST_DEPENDS
           llvm-cfi-verify
           llvm-config
           llvm-cov
+          llvm-ctxprof-util
           llvm-cvtres
           llvm-cxxdump
           llvm-cxxfilt
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index fe1262893212f..e5e3dc7e1b4bd 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -182,6 +182,7 @@ def get_asan_rtlib():
         "llvm-bitcode-strip",
         "llvm-config",
         "llvm-cov",
+        "llvm-ctxprof-util",
         "llvm-cxxdump",
         "llvm-cvtres",
         "llvm-debuginfod-find",
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
new file mode 100644
index 0000000000000..35c169002386e
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/bad.json
@@ -0,0 +1 @@
+[{]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
new file mode 100644
index 0000000000000..fe51488c7066f
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/empty.json
@@ -0,0 +1 @@
+[]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
new file mode 100644
index 0000000000000..b47e0ee1a04ba
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-bad-subctx.json
@@ -0,0 +1,8 @@
+[{
+  "Guid": 123,
+  "Counters": [1, 2],
+  "Callsites":
+  [
+    {"Guid": 1}
+  ]
+}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
new file mode 100644
index 0000000000000..95cdd45a5a0f7
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-counters.json
@@ -0,0 +1,5 @@
+[
+  {
+    "Guid": 1231
+  }
+]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
new file mode 100644
index 0000000000000..93d51406d63fb
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-ctx.json
@@ -0,0 +1 @@
+[{}]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
new file mode 100644
index 0000000000000..0967ef424bce6
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-no-vector.json
@@ -0,0 +1 @@
+{}
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json
new file mode 100644
index 0000000000000..d4a6da4142110
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.json
@@ -0,0 +1,47 @@
+[
+  {
+    "Guid": 1000,
+    "Counters": [
+      1,
+      2,
+      3
+    ],
+    "Callsites": [
+      [],
+      [
+        {
+          "Guid": 2000,
+          "Counters": [
+            4,
+            5
+          ]
+        },
+        {
+          "Guid": 18446744073709551613,
+          "Counters": [
+            6,
+            7,
+            8
+          ]
+        }
+      ],
+      [
+        {
+          "Guid": 3000,
+          "Counters": [
+            40,
+            50
+          ]
+        }
+      ]
+    ]
+  },
+  {
+    "Guid": 18446744073709551612,
+    "Counters": [
+      5,
+      9,
+      10
+    ]
+  }
+]
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
new file mode 100644
index 0000000000000..97598c11979fa
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
@@ -0,0 +1,22 @@
+; RUN: not llvm-ctxprof-util nofile.json 2>&1 | FileCheck %s --check-prefix=NO_CMD
+; RUN: not llvm-ctxprof-util invalidCmd --input nofile.json 2>&1 | FileCheck %s --check-prefix=INVALID_CMD
+; RUN: not llvm-ctxprof-util fromJSON nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FLAG
+; RUN: not llvm-ctxprof-util fromJSON --input nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FILE
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/bad.json 2>&1 | FileCheck %s --check-prefix=BAD_JSON
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-vector.json 2>&1 | FileCheck %s --check-prefix=NO_VECTOR
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-ctx.json 2>&1 | FileCheck %s --check-prefix=NO_CTX
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-no-counters.json 2>&1 | FileCheck %s --check-prefix=NO_COUNTERS
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/invalid-bad-subctx.json 2>&1 | FileCheck %s --check-prefix=BAD_SUBCTX
+; RUN: rm -rf %t
+; RUN: not llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json --output %t/output.bitstream 2>&1 | FileCheck %s --check-prefix=NO_DIR
+
+; NO_CMD: Unknown subcommand 'nofile.json'
+; INVALID_CMD: Unknown subcommand 'invalidCmd'
+; NO_FLAG: Unknown command line argument 'nofile.json'. 
+; NO_FILE: 'nofile.json': No such file or directory
+; BAD_JSON: Expected object key
+; NO_VECTOR: expected array
+; NO_CTX: missing value at (root)[0].Guid
+; NO_COUNTERS: missing value at (root)[0].Counters
+; BAD_SUBCTX: expected array at (root)[0].Callsites[0]
+; NO_DIR: failed to open output
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
new file mode 100644
index 0000000000000..d430818d05442
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
@@ -0,0 +1,46 @@
+; RUN: mkdir -p %t
+; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/empty.json -output %t/empty.bitstream
+; RUN: llvm-bcanalyzer --dump %t/empty.bitstream | FileCheck %s --check-prefix=EMPTY
+
+; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/valid.json -output %t/valid.bitstream
+
+; For the valid case, check against a reference output.
+; Note that uint64_t are printed as signed values by llvm-bcanalyzer:
+;  * 18446744073709551613 in json is -3 in the output
+;  * 18446744073709551612 in json is -4 in the output
+; Also we have no callee/context at index 0, 2 callsites for index 1, and one for
+; index 2.
+; RUN: llvm-bcanalyzer --dump %t/valid.bitstream | FileCheck %s --check-prefix=VALID
+
+; EMPTY: <BLOCKINFO_BLOCK/>
+; EMPTY-NEXT: <Metadata NumWords=1 BlockCodeSize=2>
+; EMPTY-NEXT:   <Version op0=1/>
+; EMPTY-NEXT: </Metadata>
+
+; VALID:      <BLOCKINFO_BLOCK/>
+; VALID-NEXT: <Metadata NumWords=30 BlockCodeSize=2>
+; VALID-NEXT:   <Version op0=1/>
+; VALID-NEXT:   <Context NumWords=20 BlockCodeSize=2>
+; VALID-NEXT:     <GUID op0=1000/>
+; VALID-NEXT:     <Counters op0=1 op1=2 op2=3/>
+; VALID-NEXT:     <Context NumWords=5 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=-3/>
+; VALID-NEXT:       <CalleeIndex op0=1/>
+; VALID-NEXT:       <Counters op0=6 op1=7 op2=8/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:     <Context NumWords=3 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=2000/>
+; VALID-NEXT:       <CalleeIndex op0=1/>
+; VALID-NEXT:       <Counters op0=4 op1=5/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:     <Context NumWords=3 BlockCodeSize=2>
+; VALID-NEXT:       <GUID op0=3000/>
+; VALID-NEXT:       <CalleeIndex op0=2/>
+; VALID-NEXT:       <Counters op0=40 op1=50/>
+; VALID-NEXT:     </Context>
+; VALID-NEXT:   </Context>
+; VALID-NEXT:   <Context NumWords=4 BlockCodeSize=2>
+; VALID-NEXT:     <GUID op0=-4/>
+; VALID-NEXT:     <Counters op0=5 op1=9 op2=10/>
+; VALID-NEXT:   </Context>
+; VALID-NEXT: </Metadata>
diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt
index db66dad5dc0db..b9c5a79849ec8 100644
--- a/llvm/tools/CMakeLists.txt
+++ b/llvm/tools/CMakeLists.txt
@@ -32,6 +32,7 @@ add_llvm_tool_subdirectory(lto)
 add_llvm_tool_subdirectory(gold)
 add_llvm_tool_subdirectory(llvm-ar)
 add_llvm_tool_subdirectory(llvm-config)
+add_llvm_tool_subdirectory(llvm-ctxprof-util)
 add_llvm_tool_subdirectory(llvm-lto)
 add_llvm_tool_subdirectory(llvm-profdata)
 
diff --git a/llvm/tools/llvm-ctxprof-util/CMakeLists.txt b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
new file mode 100644
index 0000000000000..abf8e1aa0651f
--- /dev/null
+++ b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  Object
+  ProfileData
+  Support
+  )
+
+add_llvm_tool(llvm-ctxprof-util
+llvm-ctxprof-util.cpp
+
+  DEPENDS
+  intrinsics_gen
+  GENERATE_DRIVER
+  )
diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
new file mode 100644
index 0000000000000..ded8c8a6e4332
--- /dev/null
+++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
@@ -0,0 +1,150 @@
+//===--- PGOCtxProfJSONReader.h - JSON format  ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// JSON format for the contextual profile for testing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/ProfileData/CtxInstrContextNode.h"
+#include "llvm/ProfileData/PGOCtxProfWriter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::SubCommand FromJSON("fromJSON", "Convert from json");
+
+static cl::opt<std::string> InputFilename(
+    "input", cl::value_desc("input"), cl::init("-"),
+    cl::desc(
+        "Input file. The format is an array of contexts.\n"
+        "Each context is a dictionary with the following keys:\n"
+        "'Guid', mandatory. The value is a 64-bit integer.\n"
+        "'Counters', mandatory. An array of 32-bit ints. These are the "
+        "counter values.\n"
+        "'Contexts', optional. An array containing arrays of contexts. The "
+        "context array at a position 'i' is the set of callees at that "
+        "callsite index. Use an empty array to indicate no callees."),
+    cl::sub(FromJSON));
+
+static cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                           cl::init("-"),
+                                           cl::desc("Output file"),
+                                           cl::sub(FromJSON));
+
+namespace {
+// A structural representation of the JSON input.
+struct DeserializableCtx {
+  GlobalValue::GUID Guid = 0;
+  std::vector<uint64_t> Counters;
+  std::vector<std::vector<DeserializableCtx>> Callsites;
+};
+
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const std::vector<DeserializableCtx> &DCList);
+
+// Convert a DeserializableCtx into a ContextNode, potentially linking it to
+// its sibling (e.g. callee at same callsite) "Next".
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const DeserializableCtx &DC,
+           ctx_profile::ContextNode *Next = nullptr) {
+  auto AllocSize = ctx_profile::ContextNode::getAllocSize(DC.Counters.size(),
+                                                          DC.Callsites.size());
+  auto *Mem = Nodes.emplace_back(std::make_unique<char[]>(AllocSize)).get();
+  std::memset(Mem, 0, AllocSize);
+  auto *Ret = new (Mem) ctx_profile::ContextNode(DC.Guid, DC.Counters.size(),
+                                                 DC.Callsites.size(), Next);
+  std::memcpy(Ret->counters(), DC.Counters.data(),
+              sizeof(uint64_t) * DC.Counters.size());
+  for (const auto &[I, DCList] : llvm::enumerate(DC.Callsites))
+    Ret->subContexts()[I] = createNode(Nodes, DCList);
+  return Ret;
+}
+
+// Convert a list of DeserializableCtx into a linked list of ContextNodes.
+ctx_profile::ContextNode *
+createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
+           const std::vector<DeserializableCtx> &DCList) {
+  ctx_profile::ContextNode *List = nullptr;
+  for (const auto &DC : DCList)
+    List = createNode(Nodes, DC, List);
+  return List;
+}
+} // namespace
+
+namespace llvm {
+namespace json {
+// Hook into the JSON deserialization.
+bool fromJSON(const Value &E, DeserializableCtx &R, Path P) {
+  json::ObjectMapper Mapper(E, P);
+  return Mapper && Mapper.map("Guid", R.Guid) &&
+         Mapper.map("Counters", R.Counters) &&
+         Mapper.mapOptional("Callsites", R.Callsites);
+}
+} // namespace json
+} // namespace llvm
+
+// Save the bitstream profile from the JSON representation.
+Error convertFromJSON() {
+  auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (!BufOrError)
+    return createFileError(InputFilename, BufOrError.getError());
+  auto P = json::parse(BufOrError.get()->getBuffer());
+  if (!P)
+    return P.takeError();
+
+  std::vector<DeserializableCtx> DCList;
+  json::Path::Root R("");
+  if (!fromJSON(*P, DCList, R))
+    return R.getError();
+  // Nodes provides memory backing for the ContextualNodes.
+  std::vector<std::unique_ptr<char[]>> Nodes;
+  std::error_code EC;
+  raw_fd_stream Out(OutputFilename, EC);
+  if (EC)
+    return createStringError(EC, "failed to open output");
+  PGOCtxProfileWriter Writer(Out);
+  for (const auto &DC : DCList) {
+    auto *TopList = createNode(Nodes, DC);
+    if (!TopList)
+      return createStringError(
+          "Unexpected error converting internal structure to ctx profile");
+    Writer.write(*TopList);
+  }
+  if (EC)
+    return createStringError(EC, "failed to write output");
+  return Error::success();
+}
+
+int llvm_ctxprof_util_main(int argc, char **argv, const llvm::ToolContext &) {
+  cl::ParseCommandLineOptions(argc, argv, "LLVM Contextual Profile Utils\n");
+  ExitOnError ExitOnErr("llvm-ctxprof-util: ");
+  if (FromJSON) {
+    if (auto E = convertFromJSON()) {
+      handleAllErrors(std::move(E), [&](const ErrorInfoBase &E) {
+        E.log(errs());
+        errs() << "\n";
+      });
+      return 1;
+    }
+    return 0;
+  }
+  llvm_unreachable("Unknown subcommands should have been handled by the "
+                   "command line parser.");
+}

>From 684a5a30e1d23805b72988d3045092ed5d115c27 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me at m-sp.org>
Date: Thu, 25 Jul 2024 22:14:15 +0200
Subject: [PATCH 195/259] [mlir][Transforms] Dialect conversion: fix crash when
 converting detached region (#100633)

This commit fixes a crash in the dialect conversion when applying a
signature conversion to a block inside a detached region.

This fixes an issue reported in
https://github.com/llvm/llvm-project/pull/97213/files/4114d5be87596e11d86706a338248ebf05cf7150#r1691809730.
---
 .../Transforms/Utils/DialectConversion.cpp    |  3 +-
 mlir/test/Transforms/test-legalizer.mlir      | 15 ++++++
 mlir/test/lib/Dialect/Test/TestPatterns.cpp   | 49 +++++++++++++++----
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index a045868b66031..059288e18049b 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1370,7 +1370,8 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization(
 
   // Create an unresolved materialization. We use a new OpBuilder to avoid
   // tracking the materialization like we do for other operations.
-  OpBuilder builder(insertBlock, insertPt);
+  OpBuilder builder(outputType.getContext());
+  builder.setInsertionPoint(insertBlock, insertPt);
   auto convertOp =
       builder.create<UnrealizedConversionCastOp>(loc, outputType, inputs);
   appendRewrite<UnresolvedMaterializationRewrite>(convertOp, converter, kind);
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 65c947198e06e..7a7af2b772ed2 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -437,3 +437,18 @@ func.func @fold_legalization() -> i32 {
   %1 = "test.op_in_place_self_fold"() : () -> (i32)
   "test.return"(%1) : (i32) -> ()
 }
+
+// -----
+
+// CHECK-LABEL: func @convert_detached_signature()
+//       CHECK:   "test.legal_op_with_region"() ({
+//       CHECK:   ^bb0(%arg0: f64):
+//       CHECK:     "test.return"() : () -> ()
+//       CHECK:   }) : () -> ()
+func.func @convert_detached_signature() {
+  "test.detached_signature_conversion"() ({
+  ^bb0(%arg0: i64):
+    "test.return"() : () -> ()
+  }) : () -> ()
+  "test.return"() : () -> ()
+}
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index a14a5da341098..0546523a58c80 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -685,6 +685,33 @@ namespace {
 //===----------------------------------------------------------------------===//
 // Region-Block Rewrite Testing
 
+/// This pattern applies a signature conversion to a block inside a detached
+/// region.
+struct TestDetachedSignatureConversion : public ConversionPattern {
+  TestDetachedSignatureConversion(MLIRContext *ctx)
+      : ConversionPattern("test.detached_signature_conversion", /*benefit=*/1,
+                          ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    if (op->getNumRegions() != 1)
+      return failure();
+    OperationState state(op->getLoc(), "test.legal_op_with_region", operands,
+                         op->getResultTypes(), {}, BlockRange());
+    Region *newRegion = state.addRegion();
+    rewriter.inlineRegionBefore(op->getRegion(0), *newRegion,
+                                newRegion->begin());
+    TypeConverter::SignatureConversion result(newRegion->getNumArguments());
+    for (unsigned i = 0, e = newRegion->getNumArguments(); i < e; ++i)
+      result.addInputs(i, rewriter.getF64Type());
+    rewriter.applySignatureConversion(&newRegion->front(), result);
+    Operation *newOp = rewriter.create(state);
+    rewriter.replaceOp(op, newOp->getResults());
+    return success();
+  }
+};
+
 /// This pattern is a simple pattern that inlines the first region of a given
 /// operation into the parent region.
 struct TestRegionRewriteBlockMovement : public ConversionPattern {
@@ -1112,16 +1139,16 @@ struct TestLegalizePatternDriver
     TestTypeConverter converter;
     mlir::RewritePatternSet patterns(&getContext());
     populateWithGenerated(patterns);
-    patterns
-        .add<TestRegionRewriteBlockMovement, TestRegionRewriteUndo,
-             TestCreateBlock, TestCreateIllegalBlock, TestUndoBlockArgReplace,
-             TestUndoBlockErase, TestPassthroughInvalidOp, TestSplitReturnType,
-             TestChangeProducerTypeI32ToF32, TestChangeProducerTypeF32ToF64,
-             TestChangeProducerTypeF32ToInvalid, TestUpdateConsumerType,
-             TestNonRootReplacement, TestBoundedRecursiveRewrite,
-             TestNestedOpCreationUndoRewrite, TestReplaceEraseOp,
-             TestCreateUnregisteredOp, TestUndoMoveOpBefore,
-             TestUndoPropertiesModification>(&getContext());
+    patterns.add<
+        TestRegionRewriteBlockMovement, TestDetachedSignatureConversion,
+        TestRegionRewriteUndo, TestCreateBlock, TestCreateIllegalBlock,
+        TestUndoBlockArgReplace, TestUndoBlockErase, TestPassthroughInvalidOp,
+        TestSplitReturnType, TestChangeProducerTypeI32ToF32,
+        TestChangeProducerTypeF32ToF64, TestChangeProducerTypeF32ToInvalid,
+        TestUpdateConsumerType, TestNonRootReplacement,
+        TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite,
+        TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore,
+        TestUndoPropertiesModification>(&getContext());
     patterns.add<TestDropOpSignatureConversion>(&getContext(), converter);
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1132,6 +1159,8 @@ struct TestLegalizePatternDriver
     target.addLegalOp<ModuleOp>();
     target.addLegalOp<LegalOpA, LegalOpB, LegalOpC, TestCastOp, TestValidOp,
                       TerminatorOp, OneRegionOp>();
+    target.addLegalOp(
+        OperationName("test.legal_op_with_region", &getContext()));
     target
         .addIllegalOp<ILLegalOpF, TestRegionBuilderOp, TestOpWithRegionFold>();
     target.addDynamicallyLegalOp<TestReturnOp>([](TestReturnOp op) {

>From 6d163613621faec5050f49b9cb9da15931359090 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 25 Jul 2024 13:16:19 -0700
Subject: [PATCH 196/259] [SDAG] Manual jumpthread in findMemType for style
 clarity [NFC]

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 92b62ccdc2755..5672b611234b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7299,14 +7299,14 @@ static std::optional<EVT> findMemType(SelectionDAG &DAG,
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
   unsigned AlignInBits = Align*8;
 
-  // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
-  if (!Scalable && Width == WidenEltWidth)
-    return RetVT;
-
   // Don't bother looking for an integer type if the vector is scalable, skip
   // to vector types.
   if (!Scalable) {
+    // If we have one element to load/store, return it.
+    if (Width == WidenEltWidth)
+      return RetVT;
+
     // See if there is larger legal integer than the element type to load/store.
     for (EVT MemVT : reverse(MVT::integer_valuetypes())) {
       unsigned MemVTWidth = MemVT.getSizeInBits();

>From 888ae9066da9c9758c03e839be6cefb3246a9b1c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 25 Jul 2024 15:25:06 -0500
Subject: [PATCH 197/259] [libc] Fix leftover debug flags in 'f16fma' function
 (#100638)

Summary:
This caused issues on NVPTX
---
 libc/src/math/generic/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 7280add7c8bb0..53155c6bbd84b 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4231,7 +4231,7 @@ add_entrypoint_object(
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O0 -ggdb3
+    -O3
 )
 
 add_entrypoint_object(

>From 31d289440b30365e0263fbeb8ce919fb4f35b636 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 25 Jul 2024 16:29:08 -0400
Subject: [PATCH 198/259] Restrict llvm-ctxprof-util tests to linux (#100650)

Follow-up to PR #100379; this avoids output formatting differences.
---
 .../tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test     | 2 ++
 llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test        | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
index 97598c11979fa..08c83c9f907fb 100644
--- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
@@ -1,3 +1,5 @@
+; REQUIRES: x86_64-linux
+
 ; RUN: not llvm-ctxprof-util nofile.json 2>&1 | FileCheck %s --check-prefix=NO_CMD
 ; RUN: not llvm-ctxprof-util invalidCmd --input nofile.json 2>&1 | FileCheck %s --check-prefix=INVALID_CMD
 ; RUN: not llvm-ctxprof-util fromJSON nofile.json 2>&1 | FileCheck %s --check-prefix=NO_FLAG
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
index d430818d05442..5a21bffa59022 100644
--- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
@@ -1,3 +1,5 @@
+; REQUIRES: x86_64-linux
+
 ; RUN: mkdir -p %t
 ; RUN: llvm-ctxprof-util fromJSON --input %S/Inputs/empty.json -output %t/empty.bitstream
 ; RUN: llvm-bcanalyzer --dump %t/empty.bitstream | FileCheck %s --check-prefix=EMPTY

>From 5909979869edca359bcbca74042c2939d900680e Mon Sep 17 00:00:00 2001
From: Tristan Ross <tristan.ross at midstall.com>
Date: Thu, 25 Jul 2024 13:29:38 -0700
Subject: [PATCH 199/259] [BOLT] Fix archive output directory for standalone on
 Mac (#100643)

CC @gulfemsavrun

Fixes a line that wasn't changed in #97130.
---
 bolt/runtime/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 948825ddc27eb..40f4fbc9f30d5 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -61,7 +61,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Da
     instr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
-  set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LLVM_LIBRARY_DIR}")
+  set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}")
   target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_compile_options(bolt_rt_instr_osx PRIVATE
     -target x86_64-apple-darwin19.6.0

>From c1d1a752cea105dba1aa999bc90feb5faa974bec Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Thu, 25 Jul 2024 13:31:54 -0700
Subject: [PATCH 200/259] [lldb/Plugins] Fix build failure on windows following
 2914a4b88837

This patch tries to fix the following build failure on Windows:

https://lab.llvm.org/buildbot/#/builders/141/builds/1083

This started happening after 2914a4b88837 and seems to be caused by the
special `#include` ordering required for the lldb-python header on Windows.

Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
 .../Python/Interfaces/ScriptedPythonInterface.h   |  8 ++++++++
 .../Interfaces/ScriptedThreadPythonInterface.h    |  7 +++++++
 .../ScriptInterpreter/Python/PythonDataObjects.h  |  9 +++++++--
 .../ScriptInterpreter/Python/SWIGPythonBridge.h   | 15 ++++++++++-----
 .../Python/ScriptInterpreterPython.h              |  8 ++++++++
 .../Python/ScriptInterpreterPythonImpl.h          | 10 ++++++++--
 6 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index e1a3156d10afd..74b89a6f6a883 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -10,6 +10,12 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPYTHONINTERFACE_H
 
 #if LLDB_ENABLE_PYTHON
+// clang-format off
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+//clang-format on
+#endif
+
 
 #include <optional>
 #include <sstream>
@@ -21,6 +27,8 @@
 #include "lldb/Interpreter/Interfaces/ScriptedInterface.h"
 #include "lldb/Utility/DataBufferHeap.h"
 
+#if LLDB_ENABLE_PYTHON
+
 #include "../PythonDataObjects.h"
 #include "../SWIGPythonBridge.h"
 #include "../ScriptInterpreterPythonImpl.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
index 5676f7f1d6752..7724f9128c514 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
@@ -9,6 +9,13 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPYTHONINTERFACE_H
 
+#if LLDB_ENABLE_PYTHON
+// clang-format off
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 
 #if LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
index 88c1bb7e729e7..f5fb605c2bf3b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
@@ -48,12 +48,17 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H
 
-#include "lldb/Host/Config.h"
-
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
 #include "lldb-python.h"
+//clang-format on
+#endif
+
+#include "lldb/Host/Config.h"
+
+#if LLDB_ENABLE_PYTHON
 
 #include "lldb/Host/File.h"
 #include "lldb/Utility/StructuredData.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 3026b6113ae8f..efe173aeea8ea 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -9,15 +9,20 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SWIGPYTHONBRIDGE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SWIGPYTHONBRIDGE_H
 
-#include <optional>
-#include <string>
-
-#include "lldb/Host/Config.h"
-
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
 // LLDB Python header must be included first
 #include "lldb-python.h"
+//clang-format on
+#endif
+
+#include "lldb/Host/Config.h"
+
+#include <optional>
+#include <string>
+
+#if LLDB_ENABLE_PYTHON
 
 #include "Plugins/ScriptInterpreter/Python/PythonDataObjects.h"
 #include "lldb/lldb-forward.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
index 2e8301a85eb6c..c54530e2a5d8f 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
@@ -9,6 +9,14 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHON_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHON_H
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 
 #if LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
index c2024efb395d7..99d5b7ae5553b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
@@ -9,11 +9,17 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H
 
-#include "lldb/Host/Config.h"
-
 #if LLDB_ENABLE_PYTHON
 
+// clang-format off
+// LLDB Python header must be included first
 #include "lldb-python.h"
+//clang-format on
+#endif
+
+#include "lldb/Host/Config.h"
+
+#if LLDB_ENABLE_PYTHON
 
 #include "PythonDataObjects.h"
 #include "ScriptInterpreterPython.h"

>From a3092152ac486f440376c732a5503b5c60d13183 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 25 Jul 2024 21:34:54 +0100
Subject: [PATCH 201/259] [VPlan] Don't create live-outs for induction
 increments.

Follow-up to fc9cd3272b5 to also skip creating live-outs for IV
increments, as those are also generated independently of VPlan for now.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 ++++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         |  5 -----
 .../Transforms/LoopVectorize/X86/pr39160.ll   |  4 +++-
 .../version-stride-with-integer-casts.ll      |  3 ---
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cab892146306c..09ca859f52680 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2822,10 +2822,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     // In this case, if IV1 has an external use, we need to avoid adding both
     // "last value of IV1" and "penultimate value of IV2". So, verify that we
     // don't already have an incoming value for the middle block.
-    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
+    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
-      Plan.removeLiveOut(PHI);
-    }
   }
 }
 
@@ -8427,8 +8425,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
 // original exit block.
-static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
-                                VPlan &Plan) {
+static void addUsersInExitBlock(
+    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
+    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
   // Only handle single-exit loops with unique exit blocks for now.
@@ -8446,7 +8445,12 @@ static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
     // live-outs.
     if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
          !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-        isa<VPWidenPointerInductionRecipe>(V))
+        isa<VPWidenPointerInductionRecipe>(V) ||
+        (isa<Instruction>(IncomingValue) &&
+         any_of(IncomingValue->users(), [&Inductions](User *U) {
+           auto *P = dyn_cast<PHINode>(U);
+           return P && Inductions.contains(P);
+         })))
       continue;
     Plan.addLiveOut(&ExitPhi, V);
   }
@@ -8661,7 +8665,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
   } else
-    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan);
+    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
+                        Legal->getInductionVars());
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0b596e7e4f633..e1d18d5d6ff79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3471,11 +3471,6 @@ class VPlan {
 
   void addLiveOut(PHINode *PN, VPValue *V);
 
-  void removeLiveOut(PHINode *PN) {
-    delete LiveOuts[PN];
-    LiveOuts.erase(PN);
-  }
-
   const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
     return LiveOuts;
   }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
index ba777f52f532b..89821dd98c520 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Make sure that we can compile the test without crash.
-define void @barney() {
+define void @barney(ptr %dst) {
 
 ; CHECK-LABEL: @barney(
 ; CHECK:       middle.block:
@@ -43,6 +43,8 @@ bb48:                                             ; preds = %bb46
 bb50:                                             ; preds = %bb50, %bb19
   %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
   %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+  %gep = getelementptr inbounds i8, ptr %dst, i64 %tmp53
+  store i8 1, ptr %gep
   %tmp54 = add i32 %tmp52, 12
   %tmp55 = add i32 %tmp52, 13
   %tmp56 = add nuw nsw i64 %tmp53, 1
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 45596169da3cc..f53c7b2fc6bae 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -34,7 +34,6 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -110,7 +109,6 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -267,7 +265,6 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]

>From 0c10bdc05f0f90ae173f29854f03796e92ed76e4 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn at google.com>
Date: Thu, 25 Jul 2024 13:38:06 -0700
Subject: [PATCH 202/259] [libc] Lazily initialize freelist malloc using
 symbols (#99254)

This requires the user to set the upper bound of the heap by defining
the symbol `__libc_heap_limit`. The heap begins at `_end` and ends
`__libc_heap_limit` bytes afterwards. This prevents a completely unused
heap from requiring any space, and it prevents the heap from being
zeroed at initialization time as part of BSS. It also allows users to
customize the available heap location without recompiling libc.

I'd think this should eventually be replaced with an implementation based
on a morecore() library. This would allow the same implementation to use
sbrk() on POSIX, `_end` and `__libc_heap_limit` on embedded, and a
buffer in tests. It would also provide better "wilderness" behavior that
tends to decrease heap fragmentation (see Wilson et al.).

See #98096
---
 libc/config/baremetal/config.json             |  5 --
 libc/config/config.json                       |  6 --
 libc/docs/configure.rst                       |  2 -
 libc/src/__support/freelist_heap.h            | 73 ++++++++-----------
 libc/src/stdlib/CMakeLists.txt                |  2 -
 libc/src/stdlib/freelist_malloc.cpp           | 13 +---
 libc/test/src/__support/CMakeLists.txt        |  1 +
 libc/test/src/__support/fake_heap.s           | 15 ++++
 .../test/src/__support/freelist_heap_test.cpp |  7 +-
 .../src/__support/freelist_malloc_test.cpp    |  4 +-
 10 files changed, 57 insertions(+), 71 deletions(-)
 create mode 100644 libc/test/src/__support/fake_heap.s

diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index 12f4c2aa3a805..f0ff268b56275 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -18,11 +18,6 @@
       "value": false
     }
   },
-  "malloc": {
-    "LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE": {
-      "value": 102400
-    }
-  },
   "qsort": {
     "LIBC_CONF_QSORT_IMPL": {
       "value": "LIBC_QSORT_HEAP_SORT"
diff --git a/libc/config/config.json b/libc/config/config.json
index 2bf432ecae342..3532925b745e7 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -71,12 +71,6 @@
       "doc": "Default number of spins before blocking if a rwlock is in contention (default to 100)."
     }
   },
-  "malloc": {
-    "LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE": {
-      "value": 1073741824,
-      "doc": "Default size for the constinit freelist buffer used for the freelist malloc implementation (default 1o 1GB)."
-    }
-  },
   "unistd": {
     "LIBC_CONF_ENABLE_TID_CACHE": {
       "value": true,
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index b81922367d8b7..1936c8791c129 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -30,8 +30,6 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
     - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
-* **"malloc" options**
-    - ``LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE``: Default size for the constinit freelist buffer used for the freelist malloc implementation (default 1o 1GB).
 * **"math" options**
     - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
 * **"printf" options**
diff --git a/libc/src/__support/freelist_heap.h b/libc/src/__support/freelist_heap.h
index a2c714e15ba87..fed00d06716cf 100644
--- a/libc/src/__support/freelist_heap.h
+++ b/libc/src/__support/freelist_heap.h
@@ -22,6 +22,9 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+extern "C" cpp::byte _end;
+extern "C" cpp::byte __llvm_libc_heap_limit;
+
 using cpp::optional;
 using cpp::span;
 
@@ -47,18 +50,10 @@ template <size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()> class FreeListHeap {
     size_t total_free_calls;
   };
 
-  FreeListHeap(span<cpp::byte> region)
-      : FreeListHeap(&*region.begin(), &*region.end(), region.size()) {
-    auto result = BlockType::init(region);
-    BlockType *block = *result;
-    freelist_.add_chunk(block_to_span(block));
-  }
+  constexpr FreeListHeap() : begin_(&_end), end_(&__llvm_libc_heap_limit) {}
 
-  constexpr FreeListHeap(void *start, cpp::byte *end, size_t total_bytes)
-      : block_region_start_(start), block_region_end_(end),
-        freelist_(DEFAULT_BUCKETS), heap_stats_{} {
-    heap_stats_.total_bytes = total_bytes;
-  }
+  constexpr FreeListHeap(span<cpp::byte> region)
+      : begin_(region.begin()), end_(region.end()) {}
 
   void *allocate(size_t size);
   void *aligned_allocate(size_t alignment, size_t size);
@@ -69,61 +64,57 @@ template <size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()> class FreeListHeap {
   void *calloc(size_t num, size_t size);
 
   const HeapStats &heap_stats() const { return heap_stats_; }
-  void reset_heap_stats() { heap_stats_ = {}; }
 
-  void *region_start() const { return block_region_start_; }
-  size_t region_size() const {
-    return reinterpret_cast<uintptr_t>(block_region_end_) -
-           reinterpret_cast<uintptr_t>(block_region_start_);
-  }
+  cpp::span<cpp::byte> region() const { return {begin_, end_}; }
 
-protected:
-  constexpr void set_freelist_node(typename FreeListType::FreeListNode &node,
-                                   cpp::span<cpp::byte> chunk) {
-    freelist_.set_freelist_node(node, chunk);
-  }
+private:
+  void init();
 
   void *allocate_impl(size_t alignment, size_t size);
 
-private:
   span<cpp::byte> block_to_span(BlockType *block) {
     return span<cpp::byte>(block->usable_space(), block->inner_size());
   }
 
-  bool is_valid_ptr(void *ptr) {
-    return ptr >= block_region_start_ && ptr < block_region_end_;
-  }
+  bool is_valid_ptr(void *ptr) { return ptr >= begin_ && ptr < end_; }
 
-  void *block_region_start_;
-  void *block_region_end_;
-  FreeListType freelist_;
-  HeapStats heap_stats_;
+  bool is_initialized_ = false;
+  cpp::byte *begin_;
+  cpp::byte *end_;
+  FreeListType freelist_{DEFAULT_BUCKETS};
+  HeapStats heap_stats_{};
 };
 
 template <size_t BUFF_SIZE, size_t NUM_BUCKETS = DEFAULT_BUCKETS.size()>
-struct FreeListHeapBuffer : public FreeListHeap<NUM_BUCKETS> {
+class FreeListHeapBuffer : public FreeListHeap<NUM_BUCKETS> {
   using parent = FreeListHeap<NUM_BUCKETS>;
   using FreeListNode = typename parent::FreeListType::FreeListNode;
 
+public:
   constexpr FreeListHeapBuffer()
-      : FreeListHeap<NUM_BUCKETS>(&block, buffer + sizeof(buffer), BUFF_SIZE),
-        block(0, BUFF_SIZE), node{}, buffer{} {
-    block.mark_last();
-
-    cpp::span<cpp::byte> chunk(buffer, sizeof(buffer));
-    parent::set_freelist_node(node, chunk);
-  }
+      : FreeListHeap<NUM_BUCKETS>{buffer}, buffer{} {}
 
-  typename parent::BlockType block;
-  FreeListNode node;
-  cpp::byte buffer[BUFF_SIZE - sizeof(block) - sizeof(node)];
+private:
+  cpp::byte buffer[BUFF_SIZE];
 };
 
+template <size_t NUM_BUCKETS> void FreeListHeap<NUM_BUCKETS>::init() {
+  LIBC_ASSERT(!is_initialized_ && "duplicate initialization");
+  heap_stats_.total_bytes = region().size();
+  auto result = BlockType::init(region());
+  BlockType *block = *result;
+  freelist_.add_chunk(block_to_span(block));
+  is_initialized_ = true;
+}
+
 template <size_t NUM_BUCKETS>
 void *FreeListHeap<NUM_BUCKETS>::allocate_impl(size_t alignment, size_t size) {
   if (size == 0)
     return nullptr;
 
+  if (!is_initialized_)
+    init();
+
   // Find a chunk in the freelist. Split it if needed, then return.
   auto chunk =
       freelist_.find_chunk_if([alignment, size](span<cpp::byte> chunk) {
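
A note on the hunk above: the heap's bounds now come from the linker-provided symbols `_end` and `__llvm_libc_heap_limit`, the constructor is constexpr so the global allocator can be constant-initialized, and the freelist itself is only built on the first allocation. A minimal C++ sketch of that pattern follows; the LazyHeap name and the stubbed-out allocate() body are hypothetical illustrations, and only the two symbol names are taken from the patch.

  // Minimal sketch (hypothetical LazyHeap) of the lazy, linker-bounded heap
  // pattern used above: constant-initialized global, freelist built on first use.
  #include <cstddef>

  extern "C" std::byte _end;                   // start of heap, defined by the linker
  extern "C" std::byte __llvm_libc_heap_limit; // end of heap, defined by the linker

  class LazyHeap {
  public:
    constexpr LazyHeap() : begin_(&_end), end_(&__llvm_libc_heap_limit) {}

    void *allocate(std::size_t size) {
      if (!initialized_) {
        // Real code would create one free block spanning [begin_, end_) here
        // and add it to the freelist; this sketch only flips the flag.
        initialized_ = true;
      }
      (void)size;
      return nullptr; // chunk search and splitting are omitted in this sketch
    }

  private:
    bool initialized_ = false;
    std::byte *begin_;
    std::byte *end_;
  };

  // No static constructor runs at startup; initialization is purely constant.
  constinit LazyHeap heap;
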
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index 03d8c48751840..d79acb390ff91 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -392,8 +392,6 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
         malloc.h
       DEPENDS
         libc.src.__support.freelist_heap
-      COMPILE_OPTIONS
-        -DLIBC_FREELIST_MALLOC_SIZE=${LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE}
     )
     get_target_property(freelist_malloc_is_skipped libc.src.stdlib.freelist_malloc "SKIPPED")
     if(LIBC_TARGET_OS_IS_BAREMETAL AND NOT freelist_malloc_is_skipped)
diff --git a/libc/src/stdlib/freelist_malloc.cpp b/libc/src/stdlib/freelist_malloc.cpp
index cfffa0425ff66..47240bc53aa37 100644
--- a/libc/src/stdlib/freelist_malloc.cpp
+++ b/libc/src/stdlib/freelist_malloc.cpp
@@ -18,17 +18,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-namespace {
-#ifdef LIBC_FREELIST_MALLOC_SIZE
-// This is set via the LIBC_CONF_FREELIST_MALLOC_BUFFER_SIZE configuration.
-constexpr size_t SIZE = LIBC_FREELIST_MALLOC_SIZE;
-#else
-#error "LIBC_FREELIST_MALLOC_SIZE was not defined for this build."
-#endif
-LIBC_CONSTINIT FreeListHeapBuffer<SIZE> freelist_heap_buffer;
-} // namespace
-
-FreeListHeap<> *freelist_heap = &freelist_heap_buffer;
+static LIBC_CONSTINIT FreeListHeap<> freelist_heap_symbols;
+FreeListHeap<> *freelist_heap = &freelist_heap_symbols;
 
 LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
   return freelist_heap->allocate(size);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index f994f65e08cbb..90de520405981 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -34,6 +34,7 @@ if(LLVM_LIBC_FULL_BUILD)
     SUITE
       libc-support-tests
     SRCS
+      fake_heap.s
       freelist_heap_test.cpp
       freelist_malloc_test.cpp
     DEPENDS
diff --git a/libc/test/src/__support/fake_heap.s b/libc/test/src/__support/fake_heap.s
new file mode 100644
index 0000000000000..69522f53c8b1f
--- /dev/null
+++ b/libc/test/src/__support/fake_heap.s
@@ -0,0 +1,15 @@
+//===-- Test fake definition for heap symbols -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+.globl _end, __llvm_libc_heap_limit
+
+.bss
+_end:
+.fill 1024
+__llvm_libc_heap_limit:
+
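
For hosted unit tests there is no baremetal linker script, so the stub above reserves 1024 bytes of .bss and brackets them with the two heap symbols. A hedged C++ sketch of how the linked-in symbols can be inspected follows; only `_end` and `__llvm_libc_heap_limit` are real symbol names, everything else is illustrative.

  // Sketch: measuring the region the fake_heap.s stub provides when linked in.
  #include <cstdint>
  #include <cstdio>

  extern "C" unsigned char _end;
  extern "C" unsigned char __llvm_libc_heap_limit;

  int main() {
    auto size = reinterpret_cast<std::uintptr_t>(&__llvm_libc_heap_limit) -
                reinterpret_cast<std::uintptr_t>(&_end);
    std::printf("fake heap spans %ju bytes\n", static_cast<std::uintmax_t>(size));
    return 0;
  }
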
diff --git a/libc/test/src/__support/freelist_heap_test.cpp b/libc/test/src/__support/freelist_heap_test.cpp
index 5815d5dfc01ab..fc4348aed6b56 100644
--- a/libc/test/src/__support/freelist_heap_test.cpp
+++ b/libc/test/src/__support/freelist_heap_test.cpp
@@ -30,6 +30,11 @@ using LIBC_NAMESPACE::freelist_heap;
 #define TEST_FOR_EACH_ALLOCATOR(TestCase, BufferSize)                          \
   class LlvmLibcFreeListHeapTest##TestCase : public testing::Test {            \
   public:                                                                      \
+    FreeListHeapBuffer<BufferSize> fake_global_buffer;                         \
+    void SetUp() override {                                                    \
+      freelist_heap =                                                          \
+          new (&fake_global_buffer) FreeListHeapBuffer<BufferSize>;            \
+    }                                                                          \
     void RunTest(FreeListHeap<> &allocator, [[maybe_unused]] size_t N);        \
   };                                                                           \
   TEST_F(LlvmLibcFreeListHeapTest##TestCase, TestCase) {                       \
@@ -37,7 +42,7 @@ using LIBC_NAMESPACE::freelist_heap;
         cpp::byte buf[BufferSize] = {cpp::byte(0)};                            \
     FreeListHeap<> allocator(buf);                                             \
     RunTest(allocator, BufferSize);                                            \
-    RunTest(*freelist_heap, freelist_heap->region_size());                     \
+    RunTest(*freelist_heap, freelist_heap->region().size());                   \
   }                                                                            \
   void LlvmLibcFreeListHeapTest##TestCase::RunTest(FreeListHeap<> &allocator,  \
                                                    size_t N)
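
The SetUp change above replaces the removed reset_heap_stats() call: each test re-runs the FreeListHeapBuffer constructor over the same storage with placement new and points the global freelist_heap at the fresh object, so no allocator state leaks between cases. A small stand-alone C++ sketch of the idiom (ToyHeapBuffer, Fixture, and global_heap are made-up names):

  // Per-test reset via placement new, assuming a trivially destructible buffer type.
  #include <cstddef>
  #include <new>

  template <std::size_t N> struct ToyHeapBuffer {
    std::byte storage[N]{};
    // allocator bookkeeping would live here in the real FreeListHeapBuffer
  };

  ToyHeapBuffer<1024> *global_heap = nullptr;

  struct Fixture {
    ToyHeapBuffer<1024> backing;
    void SetUp() {
      // Reconstruct the buffer in place and publish it as the global heap.
      global_heap = new (&backing) ToyHeapBuffer<1024>;
    }
  };

  int main() {
    Fixture f;
    f.SetUp(); // a test framework would call this before every test case
    return global_heap == &f.backing ? 0 : 1;
  }
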
diff --git a/libc/test/src/__support/freelist_malloc_test.cpp b/libc/test/src/__support/freelist_malloc_test.cpp
index e9d7c63a4d438..66a923f778218 100644
--- a/libc/test/src/__support/freelist_malloc_test.cpp
+++ b/libc/test/src/__support/freelist_malloc_test.cpp
@@ -14,15 +14,13 @@
 #include "test/UnitTest/Test.h"
 
 using LIBC_NAMESPACE::freelist_heap;
+using LIBC_NAMESPACE::FreeListHeapBuffer;
 
 TEST(LlvmLibcFreeListMalloc, MallocStats) {
   constexpr size_t kAllocSize = 256;
   constexpr size_t kCallocNum = 4;
   constexpr size_t kCallocSize = 64;
 
-  freelist_heap->reset_heap_stats(); // Do this because other tests might've
-                                     // called the same global allocator.
-
   void *ptr1 = LIBC_NAMESPACE::malloc(kAllocSize);
 
   const auto &freelist_heap_stats = freelist_heap->heap_stats();

>From cb7d4a187a3ec1814f42612997e94acd460ddabf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Thu, 25 Jul 2024 13:39:57 -0700
Subject: [PATCH 203/259] [MLIR][Vector] Add a test of <2x[4]xf32> type on
 lowering multireduction (#100451)

Add a test for a masked reduction along the scalable dimension:

vector.mask %C { vector.multi_reduction <add>, %A, %B [1] : vector<2x[4]xf32> to vector<2xf32> } : vector<2x[4]xi1> -> vector<2xf32>
---
 .../vector-multi-reduction-lowering.mlir      | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
index f70d23a193229..6e93923608cbf 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
@@ -249,11 +249,11 @@ func.func @vector_multi_reduction_parallel_middle(%arg0: vector<3x4x5xf32>, %acc
 //  CHECK-SAME:   %[[INPUT:.+]]: vector<3x4x5xf32>, %[[ACC:.+]]: vector<4xf32>
 //       CHECK: vector.transpose %[[INPUT]], [1, 0, 2] : vector<3x4x5xf32> to vector<4x3x5xf32>
 
-func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
+func.func private @vector_multi_reduction_non_scalable_dim(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
   %0 = vector.multi_reduction <add>, %A, %B [2] : vector<8x[4]x2xf32> to vector<8x[4]xf32>
   return %0 : vector<8x[4]xf32>
 }
-// CHECK-LABEL:   func.func private @scalable_dims(
+// CHECK-LABEL:   func.func private @vector_multi_reduction_non_scalable_dim(
 // CHECK-SAME:                                     %[[VAL_0:.*]]: vector<8x[4]x2xf32>,
 // CHECK-SAME:                                     %[[VAL_1:.*]]: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
 // CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<[32]xf32>
@@ -282,12 +282,12 @@ func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>
 // CHECK:           return %[[VAL_163]] : vector<8x[4]xf32>
 
 // Check that OneDimMultiReductionToTwoDim handles scalable dim
-func.func @scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) -> f32 {
+func.func @vector_multi_reduction_scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) -> f32 {
     %0 = vector.mask %C { vector.multi_reduction <add>, %A, %B [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
     return %0 : f32
 }
 
-// CHECK-LABEL:  func.func @scalable_dim_1d(
+// CHECK-LABEL:  func.func @vector_multi_reduction_scalable_dim_1d(
 // CHECK-SAME:                                      %[[ARG_0:.*]]: vector<[4]xf32>,
 // CHECK-SAME:                                      %[[ARG_1:.*]]: f32,
 // CHECK-SAME:                                      %[[ARG_2:.*]]: vector<[4]xi1>) -> f32 {
@@ -298,6 +298,30 @@ func.func @scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) ->
 // CHECK:          %[[VAL_4:.*]] = vector.extract %[[VAL_3]][0] : f32 from vector<1xf32>
 // CHECK:          return %[[VAL_4]] : f32
 
+func.func @vector_multi_reduction_scalable_dim_2d(%A: vector<2x[4]xf32>, %B: vector<2xf32>, %C: vector<2x[4]xi1>) -> vector<2xf32> {
+    %0 = vector.mask %C { vector.multi_reduction <add>, %A, %B [1] : vector<2x[4]xf32> to vector<2xf32> } : vector<2x[4]xi1> -> vector<2xf32>
+    return %0 : vector<2xf32>
+}
+
+// CHECK-LABEL:  func.func @vector_multi_reduction_scalable_dim_2d(
+// CHECK-SAME:                                      %[[ARG_0:.*]]: vector<2x[4]xf32>,
+// CHECK-SAME:                                      %[[ARG_1:.*]]: vector<2xf32>,
+// CHECK-SAME:                                      %[[ARG_2:.*]]: vector<2x[4]xi1>) -> vector<2xf32> {
+// CHECK-DAG:      %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C0_2xf32:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK:          %[[ARG0_0:.*]] = vector.extract %[[ARG_0]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:          %[[ARG1_0:.*]] = vector.extract %[[ARG_1]][0] : f32 from vector<2xf32>
+// CHECK:          %[[ARG2_0:.*]] = vector.extract %[[ARG_2]][0] : vector<[4]xi1> from vector<2x[4]xi1>
+// CHECK:          %[[REDUCE_0:.*]] = vector.mask %[[ARG2_0]] { vector.reduction <add>, %[[ARG0_0]], %[[ARG1_0]] : vector<[4]xf32> into f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[INSERT_0:.*]] = vector.insertelement %[[REDUCE_0]], %[[C0_2xf32]][%[[C0_idx]] : index] : vector<2xf32>
+// CHECK:          %[[ARG0_1:.*]] = vector.extract %[[ARG_0]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:          %[[ARG1_1:.*]] = vector.extract %[[ARG_1]][1] : f32 from vector<2xf32>
+// CHECK:          %[[ARG2_1:.*]] = vector.extract %[[ARG_2]][1] : vector<[4]xi1> from vector<2x[4]xi1>
+// CHECK:          %[[REDUCE_1:.*]] = vector.mask %[[ARG2_1]] { vector.reduction <add>, %[[ARG0_1]], %[[ARG1_1]] : vector<[4]xf32> into f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[INSERT_1:.*]] = vector.insertelement %[[REDUCE_1]], %[[INSERT_0]][%[[C1_idx]] : index] : vector<2xf32>
+// CHECK:          return %[[INSERT_1]] : vector<2xf32>
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">

>From 01a489133e5de203c9b20981ad24a94b73e51aea Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 00:45:10 +0400
Subject: [PATCH 204/259] AMDGPU: Add baseline test for vectorization of
 integer min/max (#100513)

---
 .../SLPVectorizer/AMDGPU/min_max.ll           | 354 ++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
new file mode 100644
index 0000000000000..f44705a925d5a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @uadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @usub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @usub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.umax.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umax.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @sadd_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @sadd_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.smin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.smin.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
+; GFX7-LABEL: @ssub_sat_v2i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
+;
+; GFX8-LABEL: @ssub_sat_v2i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
+;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    ret <2 x i16> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i16> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i16> %arg1, i64 1
+  %add.0 = call i16 @llvm.smax.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.smax.i16(i16 %arg0.1, i16 %arg1.1)
+  %ins.0 = insertelement <2 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 %add.1, i64 1
+  ret <2 x i16> %ins.1
+}
+
+define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @uadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.umin.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.umin.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @usub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.umax.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.umax.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @sadd_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.smin.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.smin.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
+; GCN-LABEL: @ssub_sat_v2i32(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
+;
+bb:
+  %arg0.0 = extractelement <2 x i32> %arg0, i64 0
+  %arg0.1 = extractelement <2 x i32> %arg0, i64 1
+  %arg1.0 = extractelement <2 x i32> %arg1, i64 0
+  %arg1.1 = extractelement <2 x i32> %arg1, i64 1
+  %add.0 = call i32 @llvm.smax.i32(i32 %arg0.0, i32 %arg1.0)
+  %add.1 = call i32 @llvm.smax.i32(i32 %arg0.1, i32 %arg1.1)
+  %ins.0 = insertelement <2 x i32> undef, i32 %add.0, i64 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %add.1, i64 1
+  ret <2 x i32> %ins.1
+}
+
+define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v3i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
+;
+; GFX8-LABEL: @uadd_sat_v3i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
+;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
+;
+bb:
+  %arg0.0 = extractelement <3 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <3 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <3 x i16> %arg0, i64 2
+  %arg1.0 = extractelement <3 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <3 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <3 x i16> %arg1, i64 2
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.umin.i16(i16 %arg0.2, i16 %arg1.2)
+  %ins.0 = insertelement <3 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <3 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <3 x i16> %ins.1, i16 %add.2, i64 2
+  ret <3 x i16> %ins.2
+}
+
+define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
+; GFX7-LABEL: @uadd_sat_v4i16(
+; GFX7-NEXT:  bb:
+; GFX7-NEXT:    [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0
+; GFX7-NEXT:    [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1
+; GFX7-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2
+; GFX7-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
+; GFX7-NEXT:    [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0
+; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
+; GFX7-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2
+; GFX7-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
+; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
+; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
+; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
+; GFX7-NEXT:    ret <4 x i16> [[INS_3]]
+;
+; GFX8-LABEL: @uadd_sat_v4i16(
+; GFX8-NEXT:  bb:
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
+; GFX8-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT:  bb:
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
+; GFX9-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i16> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i16> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i16> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i16> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i16> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i16> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i16> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i16> %arg1, i64 3
+  %add.0 = call i16 @llvm.umin.i16(i16 %arg0.0, i16 %arg1.0)
+  %add.1 = call i16 @llvm.umin.i16(i16 %arg0.1, i16 %arg1.1)
+  %add.2 = call i16 @llvm.umin.i16(i16 %arg0.2, i16 %arg1.2)
+  %add.3 = call i16 @llvm.umin.i16(i16 %arg0.3, i16 %arg1.3)
+  %ins.0 = insertelement <4 x i16> undef, i16 %add.0, i64 0
+  %ins.1 = insertelement <4 x i16> %ins.0, i16 %add.1, i64 1
+  %ins.2 = insertelement <4 x i16> %ins.1, i16 %add.2, i64 2
+  %ins.3 = insertelement <4 x i16> %ins.2, i16 %add.3, i64 3
+  ret <4 x i16> %ins.3
+}

>From e0c14c05a3e3178164ca7368516c080b823ef484 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 25 Jul 2024 21:47:18 +0100
Subject: [PATCH 205/259] [AArch64][PhaseOrdering] Add an SLP lane-ordering
 phase-ordering test. NFC

---
 .../PhaseOrdering/AArch64/slpordering.ll      | 482 ++++++++++++++++++
 1 file changed, 482 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
new file mode 100644
index 0000000000000..0e14481e4ea0a
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+; Check unrolling / SLP vectorization where the order of lanes is important for
+; producing efficient shuffles.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+; Function Attrs: nounwind uwtable
+define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 noundef %ip2) #0 {
+; CHECK-LABEL: define range(i32 0, 65536) i32 @slpordering
+; CHECK-SAME: (ptr nocapture noundef readonly [[P1:%.*]], i32 noundef [[IP1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], i32 noundef [[IP2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[IP1]] to i64
+; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64
+; CHECK-NEXT:    [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4
+; CHECK-NEXT:    [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]]
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 20, i32 16, i32 21, i32 17, i32 22, i32 18, i32 23, i32 19>
+; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP74]], 16
+; CHECK-NEXT:    [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
+; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[RDD119]], 1
+; CHECK-NEXT:    ret i32 [[SHR120]]
+;
+entry:
+  %p1.addr = alloca ptr, align 8
+  %ip1.addr = alloca i32, align 4
+  %p2.addr = alloca ptr, align 8
+  %ip2.addr = alloca i32, align 4
+  %emp = alloca [4 x [4 x i32]], align 4
+  %r0 = alloca i32, align 4
+  %r1 = alloca i32, align 4
+  %r2 = alloca i32, align 4
+  %r3 = alloca i32, align 4
+  %sum = alloca i32, align 4
+  %i = alloca i32, align 4
+  %e0 = alloca i32, align 4
+  %e1 = alloca i32, align 4
+  %e2 = alloca i32, align 4
+  %e3 = alloca i32, align 4
+  %i65 = alloca i32, align 4
+  %e071 = alloca i32, align 4
+  %e179 = alloca i32, align 4
+  %e287 = alloca i32, align 4
+  %e395 = alloca i32, align 4
+  store ptr %p1, ptr %p1.addr, align 8, !tbaa !4
+  store i32 %ip1, ptr %ip1.addr, align 4, !tbaa !8
+  store ptr %p2, ptr %p2.addr, align 8, !tbaa !4
+  store i32 %ip2, ptr %ip2.addr, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 64, ptr %emp) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r1) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r2) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %r3) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %sum) #2
+  store i32 0, ptr %sum, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2
+  store i32 0, ptr %i, align 4, !tbaa !8
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4, !tbaa !8
+  %cmp = icmp slt i32 %0, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx = getelementptr inbounds i8, ptr %1, i64 0
+  %2 = load i8, ptr %rrrayidx, align 1, !tbaa !10
+  %conv = zext i8 %2 to i32
+  %3 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx1 = getelementptr inbounds i8, ptr %3, i64 0
+  %4 = load i8, ptr %rrrayidx1, align 1, !tbaa !10
+  %conv2 = zext i8 %4 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %5 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx3 = getelementptr inbounds i8, ptr %5, i64 4
+  %6 = load i8, ptr %rrrayidx3, align 1, !tbaa !10
+  %conv4 = zext i8 %6 to i32
+  %7 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx5 = getelementptr inbounds i8, ptr %7, i64 4
+  %8 = load i8, ptr %rrrayidx5, align 1, !tbaa !10
+  %conv6 = zext i8 %8 to i32
+  %sub7 = sub nsw i32 %conv4, %conv6
+  %shl = shl i32 %sub7, 16
+  %rdd = add nsw i32 %sub, %shl
+  store i32 %rdd, ptr %r0, align 4, !tbaa !8
+  %9 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx8 = getelementptr inbounds i8, ptr %9, i64 1
+  %10 = load i8, ptr %rrrayidx8, align 1, !tbaa !10
+  %conv9 = zext i8 %10 to i32
+  %11 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx10 = getelementptr inbounds i8, ptr %11, i64 1
+  %12 = load i8, ptr %rrrayidx10, align 1, !tbaa !10
+  %conv11 = zext i8 %12 to i32
+  %sub12 = sub nsw i32 %conv9, %conv11
+  %13 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx13 = getelementptr inbounds i8, ptr %13, i64 5
+  %14 = load i8, ptr %rrrayidx13, align 1, !tbaa !10
+  %conv14 = zext i8 %14 to i32
+  %15 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx15 = getelementptr inbounds i8, ptr %15, i64 5
+  %16 = load i8, ptr %rrrayidx15, align 1, !tbaa !10
+  %conv16 = zext i8 %16 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %shl18 = shl i32 %sub17, 16
+  %rdd19 = add nsw i32 %sub12, %shl18
+  store i32 %rdd19, ptr %r1, align 4, !tbaa !8
+  %17 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx20 = getelementptr inbounds i8, ptr %17, i64 2
+  %18 = load i8, ptr %rrrayidx20, align 1, !tbaa !10
+  %conv21 = zext i8 %18 to i32
+  %19 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx22 = getelementptr inbounds i8, ptr %19, i64 2
+  %20 = load i8, ptr %rrrayidx22, align 1, !tbaa !10
+  %conv23 = zext i8 %20 to i32
+  %sub24 = sub nsw i32 %conv21, %conv23
+  %21 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx25 = getelementptr inbounds i8, ptr %21, i64 6
+  %22 = load i8, ptr %rrrayidx25, align 1, !tbaa !10
+  %conv26 = zext i8 %22 to i32
+  %23 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx27 = getelementptr inbounds i8, ptr %23, i64 6
+  %24 = load i8, ptr %rrrayidx27, align 1, !tbaa !10
+  %conv28 = zext i8 %24 to i32
+  %sub29 = sub nsw i32 %conv26, %conv28
+  %shl30 = shl i32 %sub29, 16
+  %rdd31 = add nsw i32 %sub24, %shl30
+  store i32 %rdd31, ptr %r2, align 4, !tbaa !8
+  %25 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx32 = getelementptr inbounds i8, ptr %25, i64 3
+  %26 = load i8, ptr %rrrayidx32, align 1, !tbaa !10
+  %conv33 = zext i8 %26 to i32
+  %27 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx34 = getelementptr inbounds i8, ptr %27, i64 3
+  %28 = load i8, ptr %rrrayidx34, align 1, !tbaa !10
+  %conv35 = zext i8 %28 to i32
+  %sub36 = sub nsw i32 %conv33, %conv35
+  %29 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %rrrayidx37 = getelementptr inbounds i8, ptr %29, i64 7
+  %30 = load i8, ptr %rrrayidx37, align 1, !tbaa !10
+  %conv38 = zext i8 %30 to i32
+  %31 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %rrrayidx39 = getelementptr inbounds i8, ptr %31, i64 7
+  %32 = load i8, ptr %rrrayidx39, align 1, !tbaa !10
+  %conv40 = zext i8 %32 to i32
+  %sub41 = sub nsw i32 %conv38, %conv40
+  %shl42 = shl i32 %sub41, 16
+  %rdd43 = add nsw i32 %sub36, %shl42
+  store i32 %rdd43, ptr %r3, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e0) #2
+  %33 = load i32, ptr %r0, align 4, !tbaa !8
+  %34 = load i32, ptr %r1, align 4, !tbaa !8
+  %rdd44 = add i32 %33, %34
+  store i32 %rdd44, ptr %e0, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e1) #2
+  %35 = load i32, ptr %r0, align 4, !tbaa !8
+  %36 = load i32, ptr %r1, align 4, !tbaa !8
+  %sub45 = sub i32 %35, %36
+  store i32 %sub45, ptr %e1, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e2) #2
+  %37 = load i32, ptr %r2, align 4, !tbaa !8
+  %38 = load i32, ptr %r3, align 4, !tbaa !8
+  %rdd46 = add i32 %37, %38
+  store i32 %rdd46, ptr %e2, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e3) #2
+  %39 = load i32, ptr %r2, align 4, !tbaa !8
+  %40 = load i32, ptr %r3, align 4, !tbaa !8
+  %sub47 = sub i32 %39, %40
+  store i32 %sub47, ptr %e3, align 4, !tbaa !8
+  %41 = load i32, ptr %e0, align 4, !tbaa !8
+  %42 = load i32, ptr %e2, align 4, !tbaa !8
+  %rdd48 = add nsw i32 %41, %42
+  %43 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom = sext i32 %43 to i64
+  %rrrayidx49 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom
+  %rrrayidx50 = getelementptr inbounds [4 x i32], ptr %rrrayidx49, i64 0, i64 0
+  store i32 %rdd48, ptr %rrrayidx50, align 4, !tbaa !8
+  %44 = load i32, ptr %e0, align 4, !tbaa !8
+  %45 = load i32, ptr %e2, align 4, !tbaa !8
+  %sub51 = sub nsw i32 %44, %45
+  %46 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom52 = sext i32 %46 to i64
+  %rrrayidx53 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom52
+  %rrrayidx54 = getelementptr inbounds [4 x i32], ptr %rrrayidx53, i64 0, i64 2
+  store i32 %sub51, ptr %rrrayidx54, align 4, !tbaa !8
+  %47 = load i32, ptr %e1, align 4, !tbaa !8
+  %48 = load i32, ptr %e3, align 4, !tbaa !8
+  %rdd55 = add nsw i32 %47, %48
+  %49 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom56 = sext i32 %49 to i64
+  %rrrayidx57 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom56
+  %rrrayidx58 = getelementptr inbounds [4 x i32], ptr %rrrayidx57, i64 0, i64 1
+  store i32 %rdd55, ptr %rrrayidx58, align 4, !tbaa !8
+  %50 = load i32, ptr %e1, align 4, !tbaa !8
+  %51 = load i32, ptr %e3, align 4, !tbaa !8
+  %sub59 = sub nsw i32 %50, %51
+  %52 = load i32, ptr %i, align 4, !tbaa !8
+  %idxprom60 = sext i32 %52 to i64
+  %rrrayidx61 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 %idxprom60
+  %rrrayidx62 = getelementptr inbounds [4 x i32], ptr %rrrayidx61, i64 0, i64 3
+  store i32 %sub59, ptr %rrrayidx62, align 4, !tbaa !8
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e3) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e2) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e0) #2
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %53 = load i32, ptr %i, align 4, !tbaa !8
+  %inc = add nsw i32 %53, 1
+  store i32 %inc, ptr %i, align 4, !tbaa !8
+  %54 = load i32, ptr %ip1.addr, align 4, !tbaa !8
+  %55 = load ptr, ptr %p1.addr, align 8, !tbaa !4
+  %idx.ext = sext i32 %54 to i64
+  %rdd.ptr = getelementptr inbounds i8, ptr %55, i64 %idx.ext
+  store ptr %rdd.ptr, ptr %p1.addr, align 8, !tbaa !4
+  %56 = load i32, ptr %ip2.addr, align 4, !tbaa !8
+  %57 = load ptr, ptr %p2.addr, align 8, !tbaa !4
+  %idx.ext63 = sext i32 %56 to i64
+  %rdd.ptr64 = getelementptr inbounds i8, ptr %57, i64 %idx.ext63
+  store ptr %rdd.ptr64, ptr %p2.addr, align 8, !tbaa !4
+  br label %for.cond, !llvm.loop !11
+
+for.end:                                          ; preds = %for.cond.cleanup
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i65) #2
+  store i32 0, ptr %i65, align 4, !tbaa !8
+  br label %for.cond66
+
+for.cond66:                                       ; preds = %for.inc114, %for.end
+  %58 = load i32, ptr %i65, align 4, !tbaa !8
+  %cmp67 = icmp slt i32 %58, 4
+  br i1 %cmp67, label %for.body70, label %for.cond.cleanup69
+
+for.cond.cleanup69:                               ; preds = %for.cond66
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i65) #2
+  br label %for.end116
+
+for.body70:                                       ; preds = %for.cond66
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e071) #2
+  %rrrayidx72 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
+  %59 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom73 = sext i32 %59 to i64
+  %rrrayidx74 = getelementptr inbounds [4 x i32], ptr %rrrayidx72, i64 0, i64 %idxprom73
+  %60 = load i32, ptr %rrrayidx74, align 4, !tbaa !8
+  %rrrayidx75 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1
+  %61 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom76 = sext i32 %61 to i64
+  %rrrayidx77 = getelementptr inbounds [4 x i32], ptr %rrrayidx75, i64 0, i64 %idxprom76
+  %62 = load i32, ptr %rrrayidx77, align 4, !tbaa !8
+  %rdd78 = add i32 %60, %62
+  store i32 %rdd78, ptr %e071, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e179) #2
+  %rrrayidx80 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 0
+  %63 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom81 = sext i32 %63 to i64
+  %rrrayidx82 = getelementptr inbounds [4 x i32], ptr %rrrayidx80, i64 0, i64 %idxprom81
+  %64 = load i32, ptr %rrrayidx82, align 4, !tbaa !8
+  %rrrayidx83 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 1
+  %65 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom84 = sext i32 %65 to i64
+  %rrrayidx85 = getelementptr inbounds [4 x i32], ptr %rrrayidx83, i64 0, i64 %idxprom84
+  %66 = load i32, ptr %rrrayidx85, align 4, !tbaa !8
+  %sub86 = sub i32 %64, %66
+  store i32 %sub86, ptr %e179, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e287) #2
+  %rrrayidx88 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
+  %67 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom89 = sext i32 %67 to i64
+  %rrrayidx90 = getelementptr inbounds [4 x i32], ptr %rrrayidx88, i64 0, i64 %idxprom89
+  %68 = load i32, ptr %rrrayidx90, align 4, !tbaa !8
+  %rrrayidx91 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3
+  %69 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom92 = sext i32 %69 to i64
+  %rrrayidx93 = getelementptr inbounds [4 x i32], ptr %rrrayidx91, i64 0, i64 %idxprom92
+  %70 = load i32, ptr %rrrayidx93, align 4, !tbaa !8
+  %rdd94 = add i32 %68, %70
+  store i32 %rdd94, ptr %e287, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %e395) #2
+  %rrrayidx96 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 2
+  %71 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom97 = sext i32 %71 to i64
+  %rrrayidx98 = getelementptr inbounds [4 x i32], ptr %rrrayidx96, i64 0, i64 %idxprom97
+  %72 = load i32, ptr %rrrayidx98, align 4, !tbaa !8
+  %rrrayidx99 = getelementptr inbounds [4 x [4 x i32]], ptr %emp, i64 0, i64 3
+  %73 = load i32, ptr %i65, align 4, !tbaa !8
+  %idxprom100 = sext i32 %73 to i64
+  %rrrayidx101 = getelementptr inbounds [4 x i32], ptr %rrrayidx99, i64 0, i64 %idxprom100
+  %74 = load i32, ptr %rrrayidx101, align 4, !tbaa !8
+  %sub102 = sub i32 %72, %74
+  store i32 %sub102, ptr %e395, align 4, !tbaa !8
+  %75 = load i32, ptr %e071, align 4, !tbaa !8
+  %76 = load i32, ptr %e287, align 4, !tbaa !8
+  %rdd103 = add nsw i32 %75, %76
+  store i32 %rdd103, ptr %r0, align 4, !tbaa !8
+  %77 = load i32, ptr %e071, align 4, !tbaa !8
+  %78 = load i32, ptr %e287, align 4, !tbaa !8
+  %sub104 = sub nsw i32 %77, %78
+  store i32 %sub104, ptr %r2, align 4, !tbaa !8
+  %79 = load i32, ptr %e179, align 4, !tbaa !8
+  %80 = load i32, ptr %e395, align 4, !tbaa !8
+  %rdd105 = add nsw i32 %79, %80
+  store i32 %rdd105, ptr %r1, align 4, !tbaa !8
+  %81 = load i32, ptr %e179, align 4, !tbaa !8
+  %82 = load i32, ptr %e395, align 4, !tbaa !8
+  %sub106 = sub nsw i32 %81, %82
+  store i32 %sub106, ptr %r3, align 4, !tbaa !8
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e395) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e287) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e179) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %e071) #2
+  %83 = load i32, ptr %r0, align 4, !tbaa !8
+  %call = call i32 @twoabs(i32 noundef %83)
+  %84 = load i32, ptr %r1, align 4, !tbaa !8
+  %call107 = call i32 @twoabs(i32 noundef %84)
+  %rdd108 = add i32 %call, %call107
+  %85 = load i32, ptr %r2, align 4, !tbaa !8
+  %call109 = call i32 @twoabs(i32 noundef %85)
+  %rdd110 = add i32 %rdd108, %call109
+  %86 = load i32, ptr %r3, align 4, !tbaa !8
+  %call111 = call i32 @twoabs(i32 noundef %86)
+  %rdd112 = add i32 %rdd110, %call111
+  %87 = load i32, ptr %sum, align 4, !tbaa !8
+  %rdd113 = add i32 %87, %rdd112
+  store i32 %rdd113, ptr %sum, align 4, !tbaa !8
+  br label %for.inc114
+
+for.inc114:                                       ; preds = %for.body70
+  %88 = load i32, ptr %i65, align 4, !tbaa !8
+  %inc115 = add nsw i32 %88, 1
+  store i32 %inc115, ptr %i65, align 4, !tbaa !8
+  br label %for.cond66, !llvm.loop !13
+
+for.end116:                                       ; preds = %for.cond.cleanup69
+  %89 = load i32, ptr %sum, align 4, !tbaa !8
+  %conv117 = trunc i32 %89 to i16
+  %conv118 = zext i16 %conv117 to i32
+  %90 = load i32, ptr %sum, align 4, !tbaa !8
+  %shr = lshr i32 %90, 16
+  %rdd119 = add i32 %conv118, %shr
+  %shr120 = lshr i32 %rdd119, 1
+  call void @llvm.lifetime.end.p0(i64 4, ptr %sum) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r3) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r2) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %r0) #2
+  call void @llvm.lifetime.end.p0(i64 64, ptr %emp) #2
+  ret i32 %shr120
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define internal i32 @twoabs(i32 noundef %r) #0 {
+entry:
+  %r.addr = alloca i32, align 4
+  %s = alloca i32, align 4
+  store i32 %r, ptr %r.addr, align 4, !tbaa !8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %s) #2
+  %0 = load i32, ptr %r.addr, align 4, !tbaa !8
+  %shr = lshr i32 %0, 15
+  %rnd = and i32 %shr, 65537
+  %mul = mul i32 %rnd, 65535
+  store i32 %mul, ptr %s, align 4, !tbaa !8
+  %1 = load i32, ptr %r.addr, align 4, !tbaa !8
+  %2 = load i32, ptr %s, align 4, !tbaa !8
+  %rdd = add i32 %1, %2
+  %3 = load i32, ptr %s, align 4, !tbaa !8
+  %xor = xor i32 %rdd, %3
+  call void @llvm.lifetime.end.p0(i64 4, ptr %s) #2
+  ret i32 %xor
+}
+
+attributes #0 = { nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nounwind }
+
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !6, i64 0}
+!10 = !{!6, !6, i64 0}
+!11 = distinct !{!11, !12}
+!12 = !{!"llvm.loop.mustprogress"}
+!13 = distinct !{!13, !12}

From 9720690f4e6b16f03b76bddaf2c49a2352503254 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 00:51:34 +0400
Subject: [PATCH 206/259] AMDGPU: Add baseline test for cost of abs intrinsics
 (#100522)

---
 llvm/test/Analysis/CostModel/AMDGPU/abs.ll | 310 +++++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/abs.ll

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/abs.ll b/llvm/test/Analysis/CostModel/AMDGPU/abs.ll
new file mode 100644
index 0000000000000..f65615b07abc0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/abs.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s
+; END.
+
+define void @abs_nonpoison() {
+; FAST-LABEL: 'abs_nonpoison'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'abs_nonpoison'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'abs_nonpoison'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'abs_nonpoison'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false)
+  %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false)
+  %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false)
+  %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 false)
+  %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false)
+
+  %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false)
+  %V2I32  = call <2 x i32>  @llvm.abs.v2i32(<2 x i32> undef, i1 false)
+  %V4I32  = call <4 x i32>  @llvm.abs.v4i32(<4 x i32> undef, i1 false)
+  %V8I32  = call <8 x i32>  @llvm.abs.v8i32(<8 x i32> undef, i1 false)
+  %V9I32  = call <9 x i32>  @llvm.abs.v9i32(<9 x i32> undef, i1 false)
+  %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false)
+
+  %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false)
+  %V2I16  = call <2 x i16>  @llvm.abs.v2i16(<2 x i16> undef, i1 false)
+  %V4I16  = call <4 x i16>  @llvm.abs.v4i16(<4 x i16> undef, i1 false)
+  %V8I16  = call <8 x i16>  @llvm.abs.v8i16(<8 x i16> undef, i1 false)
+  %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false)
+  %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 false)
+  %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false)
+
+  %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false)
+  %V2I8  = call <2 x i8>  @llvm.abs.v2i8(<2 x i8> undef, i1 false)
+  %V4I8  = call <4 x i8>  @llvm.abs.v4i8(<4 x i8> undef, i1 false)
+  %V8I8  = call <8 x i8>  @llvm.abs.v8i8(<8 x i8> undef, i1 false)
+  %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false)
+  %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 false)
+  %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 false)
+  %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 false)
+
+  ret void
+}
+
+define void @abs_poison() {
+; FAST-LABEL: 'abs_poison'
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'abs_poison'
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'abs_poison'
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'abs_poison'
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V9I32 = call <9 x i32> @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %I64 = call i64 @llvm.abs.i64(i64 undef, i1 true)
+  %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 true)
+  %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 true)
+  %V5I64 = call <5 x i64> @llvm.abs.v5i64(<5 x i64> undef, i1 true)
+  %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 true)
+
+  %I32 = call i32 @llvm.abs.i32(i32 undef, i1 true)
+  %V2I32  = call <2 x i32>  @llvm.abs.v2i32(<2 x i32> undef, i1 true)
+  %V4I32  = call <4 x i32>  @llvm.abs.v4i32(<4 x i32> undef, i1 true)
+  %V8I32  = call <8 x i32>  @llvm.abs.v8i32(<8 x i32> undef, i1 true)
+  %V9I32  = call <9 x i32>  @llvm.abs.v9i32(<9 x i32> undef, i1 true)
+  %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 true)
+
+  %I16 = call i16 @llvm.abs.i16(i16 undef, i1 true)
+  %V2I16  = call <2 x i16>  @llvm.abs.v2i16(<2 x i16> undef, i1 true)
+  %V4I16  = call <4 x i16>  @llvm.abs.v4i16(<4 x i16> undef, i1 true)
+  %V8I16  = call <8 x i16>  @llvm.abs.v8i16(<8 x i16> undef, i1 true)
+  %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 true)
+  %V17I16 = call <17 x i16> @llvm.abs.v17i16(<17 x i16> undef, i1 true)
+  %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 true)
+
+  %I8 = call i8 @llvm.abs.i8(i8 undef, i1 true)
+  %V2I8  = call <2 x i8>  @llvm.abs.v2i8(<2 x i8> undef, i1 true)
+  %V4I8  = call <4 x i8>  @llvm.abs.v4i8(<4 x i8> undef, i1 true)
+  %V8I8  = call <8 x i8>  @llvm.abs.v8i8(<8 x i8> undef, i1 true)
+  %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 true)
+  %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> undef, i1 true)
+  %V33I8 = call <33 x i8> @llvm.abs.v33i8(<33 x i8> undef, i1 true)
+  %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> undef, i1 true)
+
+  ret void
+}

From 4aa4ee909029cd7cd85d67b41d488a6edb802dce Mon Sep 17 00:00:00 2001
From: Alex Langford <alangford at apple.com>
Date: Thu, 25 Jul 2024 14:05:00 -0700
Subject: [PATCH 207/259] [Support] Add
 SpecificBumpPtrAllocator::identifyObject (#100475)

This already exists in BumpPtrAllocatorImpl, but I would like to use it
from SpecificBumpPtrAllocator.

I also noticed there was no test for identifyObject, so I added one.
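
(Not part of the change itself, but for illustration: a minimal usage sketch
of the new forwarding method. The `Widget` payload type and the surrounding
function are hypothetical.)

    #include "llvm/Support/Allocator.h"
    #include <cassert>
    #include <new>
    #include <optional>

    struct Widget { int Id; }; // hypothetical payload type

    void identifyObjectSketch() {
      llvm::SpecificBumpPtrAllocator<Widget> Alloc;
      // Allocate() only reserves space; construct the object with placement new.
      Widget *W = new (Alloc.Allocate()) Widget{42};

      // With this patch the slab-relative index can be queried directly on the
      // typed allocator instead of needing access to a raw BumpPtrAllocator.
      std::optional<int64_t> Idx = Alloc.identifyObject(W);
      assert(Idx && *Idx >= 0 && "allocated from a standard slab");
    }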
---
 llvm/include/llvm/Support/Allocator.h    |  7 +++++++
 llvm/unittests/Support/AllocatorTest.cpp | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h
index eb6c4d3668550..e05f0ec0e8704 100644
--- a/llvm/include/llvm/Support/Allocator.h
+++ b/llvm/include/llvm/Support/Allocator.h
@@ -435,6 +435,13 @@ template <typename T> class SpecificBumpPtrAllocator {
 
   /// Allocate space for an array of objects without constructing them.
   T *Allocate(size_t num = 1) { return Allocator.Allocate<T>(num); }
+
+  /// \return An index uniquely and reproducibly identifying
+  /// an input pointer \p Ptr in the given allocator.
+  /// Returns an empty optional if the pointer is not found in the allocator.
+  std::optional<int64_t> identifyObject(const void *Ptr) {
+    return Allocator.identifyObject(Ptr);
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/unittests/Support/AllocatorTest.cpp b/llvm/unittests/Support/AllocatorTest.cpp
index b361b0aa72529..1069e436d0a16 100644
--- a/llvm/unittests/Support/AllocatorTest.cpp
+++ b/llvm/unittests/Support/AllocatorTest.cpp
@@ -208,6 +208,27 @@ TEST(AllocatorTest, TestSlowerSlabGrowthDelay) {
   EXPECT_EQ(SlabSize * GrowthDelay + SlabSize * 2, Alloc.getTotalMemory());
 }
 
+TEST(AllocatorTest, TestIdentifyObject) {
+  BumpPtrAllocator Alloc;
+
+  uint64_t *a = (uint64_t *)Alloc.Allocate(sizeof(uint64_t), alignof(uint64_t));
+  std::optional<int64_t> maybe_a_belongs = Alloc.identifyObject(a);
+  EXPECT_TRUE(maybe_a_belongs.has_value());
+  EXPECT_TRUE(*maybe_a_belongs >= 0);
+
+  uint64_t *b = nullptr;
+  std::optional<int64_t> maybe_b_belongs = Alloc.identifyObject(b);
+  EXPECT_FALSE(maybe_b_belongs);
+
+  // The default slab size is 4096 (or 512 uint64_t values). Custom slabs are
+  // allocated when the requested size is larger than the slab size.
+  uint64_t *c =
+      (uint64_t *)Alloc.Allocate(sizeof(uint64_t) * 1024, alignof(uint64_t));
+  std::optional<int64_t> maybe_c_belongs = Alloc.identifyObject(c);
+  EXPECT_TRUE(maybe_c_belongs.has_value());
+  EXPECT_TRUE(*maybe_c_belongs < 0);
+}
+
 // Mock slab allocator that returns slabs aligned on 4096 bytes.  There is no
 // easy portable way to do this, so this is kind of a hack.
 class MockSlabAllocator {

From c9e5af3944e85c5f1272c48522b4e9eda398b462 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 14:07:31 -0700
Subject: [PATCH 208/259] [DenseMap] Optimize find/erase

`LookupBucketFor` is used for `find`, `insert`, `erase`, and their
variants. While tombstone comparison isn't needed by `find`/`erase`,
`LookupBucketFor` calls `getTombstoneKey` regardless; that call might have
an opaque implementation, or simply might not be optimized out, leading to
unnecessary overhead for `find` and `erase`.

Pull Request: https://github.com/llvm/llvm-project/pull/100517
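
(A hedged illustration, not part of the patch: the call pattern that benefits.
A pure lookup only needs to distinguish "found" from "hit an empty bucket",
which is exactly what the new probe loop checks.)

    #include "llvm/ADT/DenseMap.h"

    // Before this change, find() went through LookupBucketFor, whose probe
    // loop compares every visited bucket against both the empty key and the
    // tombstone key. After it, find()/erase() go through doFind, which only
    // checks for the looked-up key and for an empty bucket.
    int lookupOrZero(const llvm::DenseMap<int, int> &M, int Key) {
      auto It = M.find(Key);
      return It == M.end() ? 0 : It->second;
    }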
---
 llvm/include/llvm/ADT/DenseMap.h | 80 +++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 7ccc9445c0a7b..67d474d8e4aa6 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -143,8 +143,7 @@ class DenseMapBase : public DebugEpochBase {
 
   /// Return true if the specified key is in the map, false otherwise.
   bool contains(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    return LookupBucketFor(Val, TheBucket);
+    return doFind(Val) != nullptr;
   }
 
   /// Return 1 if the specified key is in the map, 0 otherwise.
@@ -153,21 +152,17 @@ class DenseMapBase : public DebugEpochBase {
   }
 
   iterator find(const_arg_type_t<KeyT> Val) {
-    BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeIterator(TheBucket,
-                          shouldReverseIterate<KeyT>() ? getBuckets()
-                                                       : getBucketsEnd(),
-                          *this, true);
+    if (BucketT *Bucket = doFind(Val))
+      return makeIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
   const_iterator find(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeConstIterator(TheBucket,
-                               shouldReverseIterate<KeyT>() ? getBuckets()
-                                                            : getBucketsEnd(),
-                               *this, true);
+    if (const BucketT *Bucket = doFind(Val))
+      return makeConstIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
 
@@ -178,31 +173,26 @@ class DenseMapBase : public DebugEpochBase {
   /// type used.
   template<class LookupKeyT>
   iterator find_as(const LookupKeyT &Val) {
-    BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeIterator(TheBucket,
-                          shouldReverseIterate<KeyT>() ? getBuckets()
-                                                       : getBucketsEnd(),
-                          *this, true);
+    if (BucketT *Bucket = doFind(Val))
+      return makeIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
   template<class LookupKeyT>
   const_iterator find_as(const LookupKeyT &Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return makeConstIterator(TheBucket,
-                               shouldReverseIterate<KeyT>() ? getBuckets()
-                                                            : getBucketsEnd(),
-                               *this, true);
+    if (const BucketT *Bucket = doFind(Val))
+      return makeConstIterator(
+          Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
+          *this, true);
     return end();
   }
 
   /// lookup - Return the entry for the specified key, or a default
   /// constructed value if no such entry exists.
   ValueT lookup(const_arg_type_t<KeyT> Val) const {
-    const BucketT *TheBucket;
-    if (LookupBucketFor(Val, TheBucket))
-      return TheBucket->getSecond();
+    if (const BucketT *Bucket = doFind(Val))
+      return Bucket->getSecond();
     return ValueT();
   }
 
@@ -343,8 +333,8 @@ class DenseMapBase : public DebugEpochBase {
   }
 
   bool erase(const KeyT &Val) {
-    BucketT *TheBucket;
-    if (!LookupBucketFor(Val, TheBucket))
+    BucketT *TheBucket = doFind(Val);
+    if (!TheBucket)
       return false; // not in map.
 
     TheBucket->getSecond().~ValueT();
@@ -643,6 +633,34 @@ class DenseMapBase : public DebugEpochBase {
     return TheBucket;
   }
 
+  template <typename LookupKeyT> BucketT *doFind(const LookupKeyT &Val) {
+    BucketT *BucketsPtr = getBuckets();
+    const unsigned NumBuckets = getNumBuckets();
+    if (NumBuckets == 0)
+      return nullptr;
+
+    const KeyT EmptyKey = getEmptyKey();
+    unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
+    unsigned ProbeAmt = 1;
+    while (true) {
+      BucketT *Bucket = BucketsPtr + BucketNo;
+      if (LLVM_LIKELY(KeyInfoT::isEqual(Val, Bucket->getFirst())))
+        return Bucket;
+      if (LLVM_LIKELY(KeyInfoT::isEqual(Bucket->getFirst(), EmptyKey)))
+        return nullptr;
+
+      // Otherwise, it's a hash collision or a tombstone, continue quadratic
+      // probing.
+      BucketNo += ProbeAmt++;
+      BucketNo &= NumBuckets - 1;
+    }
+  }
+
+  template <typename LookupKeyT>
+  const BucketT *doFind(const LookupKeyT &Val) const {
+    return const_cast<DenseMapBase *>(this)->doFind(Val); // NOLINT
+  }
+
   /// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in
   /// FoundBucket.  If the bucket contains the key and a value, this returns
   /// true, otherwise it returns a bucket with an empty marker or tombstone and

From 5b4e5f8ac6c6f7e25f7c87a26f2d2eaa0ebba8e3 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar at amd.com>
Date: Thu, 25 Jul 2024 16:28:11 -0500
Subject: [PATCH 209/259] [OpenMPIRBuilder][Clang][NFC] - Combine
 `emitOffloadingArrays` and `emitOffloadingArraysArgument` in OpenMPIRBuilder
 (#97088)

This patch introduces a new interface in `OpenMPIRBuilder` that combines
the creation of the so-called offloading pointer arrays and their
subsequent preparation as arguments to the OpenMP runtime library. We
then use this in Clang.

This is intended to be used in the near future
by other frontends, such as Flang, when lowering MLIR to LLVM IR.
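
(Sketch only, not part of the patch: the call-site shape of the combined
entry point, with the argument names taken from the Clang hunk below. A
frontend supplies its own insert points, combined map info, and callbacks;
previously it had to emit the offloading arrays and then package them as
runtime arguments in two separate OpenMPIRBuilder calls.)

    OMPBuilder.emitOffloadingArraysAndArgs(
        AllocaIP, CodeGenIP, Info, Info.RTArgs, CombinedInfo,
        /*IsNonContiguous=*/IsNonContiguous, /*ForEndCall=*/ForEndCall,
        DeviceAddrCB, CustomMapperCB);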
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 133 ++++++++++--------
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  20 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  33 +++--
 .../Frontend/OpenMPIRBuilderTest.cpp          |   4 +-
 4 files changed, 112 insertions(+), 78 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 2cef23c733d25..ec644acf68a20 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8867,36 +8867,21 @@ emitMappingInformation(CodeGenFunction &CGF, llvm::OpenMPIRBuilder &OMPBuilder,
                                          PLoc.getLine(), PLoc.getColumn(),
                                          SrcLocStrSize);
 }
-
 /// Emit the arrays used to pass the captures and map information to the
 /// offloading runtime library. If there is no map or capture information,
 /// return nullptr by reference.
-static void emitOffloadingArrays(
+static void emitOffloadingArraysAndArgs(
     CodeGenFunction &CGF, MappableExprsHandler::MapCombinedInfoTy &CombinedInfo,
     CGOpenMPRuntime::TargetDataInfo &Info, llvm::OpenMPIRBuilder &OMPBuilder,
-    bool IsNonContiguous = false) {
+    bool IsNonContiguous = false, bool ForEndCall = false) {
   CodeGenModule &CGM = CGF.CGM;
 
-  // Reset the array information.
-  Info.clearArrayInfo();
-  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
-
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
                          CGF.AllocaInsertPt->getIterator());
   InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
 
-  auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) {
-    return emitMappingInformation(CGF, OMPBuilder, MapExpr);
-  };
-  if (CGM.getCodeGenOpts().getDebugInfo() !=
-      llvm::codegenoptions::NoDebugInfo) {
-    CombinedInfo.Names.resize(CombinedInfo.Exprs.size());
-    llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(),
-                    FillInfoMap);
-  }
-
   auto DeviceAddrCB = [&](unsigned int I, llvm::Value *NewDecl) {
     if (const ValueDecl *DevVD = CombinedInfo.DevicePtrDecls[I]) {
       Info.CaptureDeviceAddrMap.try_emplace(DevVD, NewDecl);
@@ -8907,14 +8892,14 @@ static void emitOffloadingArrays(
     llvm::Value *MFunc = nullptr;
     if (CombinedInfo.Mappers[I]) {
       Info.HasMapper = true;
-      MFunc = CGF.CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc(
+      MFunc = CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc(
           cast<OMPDeclareMapperDecl>(CombinedInfo.Mappers[I]));
     }
     return MFunc;
   };
-  OMPBuilder.emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
-                                  /*IsNonContiguous=*/true, DeviceAddrCB,
-                                  CustomMapperCB);
+  OMPBuilder.emitOffloadingArraysAndArgs(
+      AllocaIP, CodeGenIP, Info, Info.RTArgs, CombinedInfo, IsNonContiguous,
+      ForEndCall, DeviceAddrCB, CustomMapperCB);
 }
 
 /// Check for inner distribute directive.
@@ -9479,29 +9464,14 @@ llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
   }
   return DynCGroupMem;
 }
+static void genMapInfoForCaptures(
+    MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
+    const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+    llvm::OpenMPIRBuilder &OMPBuilder,
+    llvm::DenseSet<CanonicalDeclPtr<const Decl>> &MappedVarSet,
+    MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
 
-static void emitTargetCallKernelLaunch(
-    CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
-    const OMPExecutableDirective &D,
-    llvm::SmallVectorImpl<llvm::Value *> &CapturedVars, bool RequiresOuterTask,
-    const CapturedStmt &CS, bool OffloadingMandatory,
-    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
-    llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
-    llvm::Value *&MapTypesArray, llvm::Value *&MapNamesArray,
-    llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
-                                     const OMPLoopDirective &D)>
-        SizeEmitter,
-    CodeGenFunction &CGF, CodeGenModule &CGM) {
-  llvm::OpenMPIRBuilder &OMPBuilder = OMPRuntime->getOMPBuilder();
-
-  // Fill up the arrays with all the captured variables.
-  MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
-
-  // Get mappable expression information.
-  MappableExprsHandler MEHandler(D, CGF);
   llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;
-  llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
-
   auto RI = CS.getCapturedRecordDecl()->field_begin();
   auto *CV = CapturedVars.begin();
   for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
@@ -9568,18 +9538,64 @@ static void emitTargetCallKernelLaunch(
   MEHandler.adjustMemberOfForLambdaCaptures(
       OMPBuilder, LambdaPointers, CombinedInfo.BasePointers,
       CombinedInfo.Pointers, CombinedInfo.Types);
+}
+static void
+genMapInfo(MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
+           MappableExprsHandler::MapCombinedInfoTy &CombinedInfo,
+           llvm::OpenMPIRBuilder &OMPBuilder,
+           const llvm::DenseSet<CanonicalDeclPtr<const Decl>> &SkippedVarSet =
+               llvm::DenseSet<CanonicalDeclPtr<const Decl>>()) {
+
+  CodeGenModule &CGM = CGF.CGM;
   // Map any list items in a map clause that were not captures because they
   // weren't referenced within the construct.
-  MEHandler.generateAllInfo(CombinedInfo, OMPBuilder, MappedVarSet);
+  MEHandler.generateAllInfo(CombinedInfo, OMPBuilder, SkippedVarSet);
+
+  auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) {
+    return emitMappingInformation(CGF, OMPBuilder, MapExpr);
+  };
+  if (CGM.getCodeGenOpts().getDebugInfo() !=
+      llvm::codegenoptions::NoDebugInfo) {
+    CombinedInfo.Names.resize(CombinedInfo.Exprs.size());
+    llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(),
+                    FillInfoMap);
+  }
+}
 
+static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF,
+                       const CapturedStmt &CS,
+                       llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
+                       llvm::OpenMPIRBuilder &OMPBuilder,
+                       MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
+  // Get mappable expression information.
+  MappableExprsHandler MEHandler(D, CGF);
+  llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
+
+  genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, OMPBuilder,
+                        MappedVarSet, CombinedInfo);
+  genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder, MappedVarSet);
+}
+static void emitTargetCallKernelLaunch(
+    CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
+    const OMPExecutableDirective &D,
+    llvm::SmallVectorImpl<llvm::Value *> &CapturedVars, bool RequiresOuterTask,
+    const CapturedStmt &CS, bool OffloadingMandatory,
+    llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
+    llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
+    llvm::Value *&MapTypesArray, llvm::Value *&MapNamesArray,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &CGF,
+                                     const OMPLoopDirective &D)>
+        SizeEmitter,
+    CodeGenFunction &CGF, CodeGenModule &CGM) {
+  llvm::OpenMPIRBuilder &OMPBuilder = OMPRuntime->getOMPBuilder();
+
+  // Fill up the arrays with all the captured variables.
+  MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
   CGOpenMPRuntime::TargetDataInfo Info;
-  // Fill up the arrays and create the arguments.
-  emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder);
-  bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
-                   llvm::codegenoptions::NoDebugInfo;
-  OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
-                                          EmitDebug,
-                                          /*ForEndCall=*/false);
+  genMapInfo(D, CGF, CS, CapturedVars, OMPBuilder, CombinedInfo);
+
+  emitOffloadingArraysAndArgs(CGF, CombinedInfo, Info, OMPBuilder,
+                              /*IsNonContiguous=*/true, /*ForEndCall=*/false);
 
   InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
   InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
@@ -10474,22 +10490,15 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
                                           PrePostActionTy &) {
     // Fill up the arrays with all the mapped variables.
     MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
-
-    // Get map clause information.
+    CGOpenMPRuntime::TargetDataInfo Info;
     MappableExprsHandler MEHandler(D, CGF);
-    MEHandler.generateAllInfo(CombinedInfo, OMPBuilder);
+    genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder);
+    emitOffloadingArraysAndArgs(CGF, CombinedInfo, Info, OMPBuilder,
+                                /*IsNonContiguous=*/true, /*ForEndCall=*/false);
 
-    CGOpenMPRuntime::TargetDataInfo Info;
-    // Fill up the arrays and create the arguments.
-    emitOffloadingArrays(CGF, CombinedInfo, Info, OMPBuilder,
-                         /*IsNonContiguous=*/true);
     bool RequiresOuterTask = D.hasClausesOfKind<OMPDependClause>() ||
                              D.hasClausesOfKind<OMPNowaitClause>();
-    bool EmitDebug = CGF.CGM.getCodeGenOpts().getDebugInfo() !=
-                     llvm::codegenoptions::NoDebugInfo;
-    OMPBuilder.emitOffloadingArraysArgument(CGF.Builder, Info.RTArgs, Info,
-                                            EmitDebug,
-                                            /*ForEndCall=*/false);
+
     InputInfo.NumberOfTargetItems = Info.NumberOfPtrs;
     InputInfo.BasePointersArray = Address(Info.RTArgs.BasePointersArray,
                                           CGF.VoidPtrTy, CGM.getPointerAlign());
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index a6995888de7d4..1614d5716d28c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2231,6 +2231,8 @@ class OpenMPIRBuilder {
     /// The total number of pointers passed to the runtime library.
     unsigned NumberOfPtrs = 0u;
 
+    bool EmitDebug = false;
+
     explicit TargetDataInfo() {}
     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                             bool SeparateBeginEndCalls)
@@ -2349,7 +2351,6 @@ class OpenMPIRBuilder {
   void emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                     OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
                                     OpenMPIRBuilder::TargetDataInfo &Info,
-                                    bool EmitDebug = false,
                                     bool ForEndCall = false);
 
   /// Emit an array of struct descriptors to be assigned to the offload args.
@@ -2360,13 +2361,28 @@ class OpenMPIRBuilder {
 
   /// Emit the arrays used to pass the captures and map information to the
   /// offloading runtime library. If there is no map or capture information,
-  /// return nullptr by reference.
+  /// return nullptr by reference. Accepts a reference to a MapInfosTy object
+  /// that contains information generated for mappable clauses,
+  /// including base pointers, pointers, sizes, map types, user-defined mappers.
   void emitOffloadingArrays(
       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
       TargetDataInfo &Info, bool IsNonContiguous = false,
       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
 
+  /// Allocates memory for and populates the arrays required for offloading
+  /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
+  /// emits their base addresses as arguments to be passed to the runtime
+  /// library. In essence, this function is a combination of
+  /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
+  /// be preferred by clients of OpenMPIRBuilder.
+  void emitOffloadingArraysAndArgs(
+      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
+      TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
+      bool IsNonContiguous = false, bool ForEndCall = false,
+      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
+      function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
+
   /// Creates offloading entry for the provided entry ID \a ID, address \a
   /// Addr, size \a Size, and flags \a Flags.
   void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 45b568ab5248e..77e350e7276ab 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6368,8 +6368,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
                          CustomMapperCB);
 
     TargetDataRTArgs RTArgs;
-    emitOffloadingArraysArgument(Builder, RTArgs, Info,
-                                 !MapInfo->Names.empty());
+    emitOffloadingArraysArgument(Builder, RTArgs, Info);
 
     // Emit the number of elements in the offloading arrays.
     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
@@ -6422,8 +6421,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
   // Generate code for the closing of the data region.
   auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
     TargetDataRTArgs RTArgs;
-    emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
-                                 /*ForEndCall=*/true);
+    Info.EmitDebug = !MapInfo->Names.empty();
+    emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
 
     // Emit the number of elements in the offloading arrays.
     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
@@ -7053,6 +7052,16 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
                     << "\n");
   return Builder.saveIP();
 }
+void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
+    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
+    TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
+    bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
+    function_ref<Value *(unsigned int)> CustomMapperCB) {
+  emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
+                       DeviceAddrCB, CustomMapperCB);
+  emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
+}
+
 static void emitTargetCall(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
@@ -7066,12 +7075,11 @@ static void emitTargetCall(
       /*SeparateBeginEndCalls=*/true);
 
   OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
-  OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
-                                  /*IsNonContiguous=*/true);
-
   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
-  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
-                                          !MapInfo.Names.empty());
+  OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
+                                         RTArgs, MapInfo,
+                                         /*IsNonContiguous=*/true,
+                                         /*ForEndCall=*/false);
 
   //  emitKernelLaunch
   auto &&EmitTargetCallFallbackCB =
@@ -7081,7 +7089,7 @@ static void emitTargetCall(
     return Builder.saveIP();
   };
 
-  unsigned NumTargetItems = MapInfo.BasePointers.size();
+  unsigned NumTargetItems = Info.NumberOfPtrs;
   // TODO: Use correct device ID
   Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
   Value *NumTeamsVal = Builder.getInt32(NumTeams);
@@ -7275,7 +7283,6 @@ void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                                    TargetDataRTArgs &RTArgs,
                                                    TargetDataInfo &Info,
-                                                   bool EmitDebug,
                                                    bool ForEndCall) {
   assert((!ForEndCall || Info.separateBeginEndCalls()) &&
          "expected region end call to runtime only when end call is separate");
@@ -7315,7 +7322,7 @@ void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
 
   // Only emit the mapper information arrays if debug information is
   // requested.
-  if (!EmitDebug)
+  if (!Info.EmitDebug)
     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
   else
     RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
@@ -7504,9 +7511,11 @@ void OpenMPIRBuilder::emitOffloadingArrays(
     auto *MapNamesArrayGbl =
         createOffloadMapnames(CombinedInfo.Names, MapnamesName);
     Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
+    Info.EmitDebug = true;
   } else {
     Info.RTArgs.MapNamesArray =
         Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
+    Info.EmitDebug = false;
   }
 
   // If there's a present map type modifier, it must not be applied to the end
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 8653bbd3d38fd..cb4c289f409a1 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6902,8 +6902,8 @@ TEST_F(OpenMPIRBuilderTest, EmitOffloadingArraysArguments) {
   Info.RTArgs.MappersArray =
       ConstantPointerNull::get(Array4VoidPtrTy->getPointerTo());
   Info.NumberOfPtrs = 4;
-
-  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info, false, false);
+  Info.EmitDebug = false;
+  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info, false);
 
   EXPECT_NE(RTArgs.BasePointersArray, nullptr);
   EXPECT_NE(RTArgs.PointersArray, nullptr);

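For readers wanting to adopt the new entry point, here is a minimal sketch of how a frontend might drive it, mirroring the call emitted in emitTargetCall above. It assumes an already-configured OpenMPIRBuilder (`OMPBuilder`), an IRBuilder (`Builder`), an alloca insert point, and generated MapInfosTy data; it is not copied verbatim from clang or Flang.

// Hedged sketch (assumed caller-provided names, not actual clang/Flang code).
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"

static void emitOffloadArgsSketch(llvm::OpenMPIRBuilder &OMPBuilder,
                                  llvm::IRBuilderBase &Builder,
                                  llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
                                  llvm::OpenMPIRBuilder::MapInfosTy &MapInfo) {
  llvm::OpenMPIRBuilder::TargetDataInfo Info(
      /*RequiresDevicePointerInfo=*/false, /*SeparateBeginEndCalls=*/true);
  llvm::OpenMPIRBuilder::TargetDataRTArgs RTArgs;
  // One call replaces the previous emitOffloadingArrays +
  // emitOffloadingArraysArgument pair; Info.EmitDebug is set internally
  // when map names are available.
  OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
                                         RTArgs, MapInfo,
                                         /*IsNonContiguous=*/true,
                                         /*ForEndCall=*/false);
  // Info.NumberOfPtrs and the RTArgs.* arrays are now ready to be passed to
  // the offloading runtime call.
}
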
>From 38f2f9527249894a6a77c22abe8f39d6369d7ced Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules at users.noreply.github.com>
Date: Thu, 25 Jul 2024 14:42:46 -0700
Subject: [PATCH 210/259] [Fuchsia] Apply armv8m flags for runtimes (#100656)

These flags are needed for cortex-m33. Ideally, we would eventually have
separate multilibs for different ABI configurations, so the cortex-m33
flags would not need to be set explicitly here; the driver should handle it.
---
 clang/cmake/caches/Fuchsia-stage2.cmake | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index 8e0835cb3158f..75ceb799352f9 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -309,7 +309,7 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi)
   foreach(lang C;CXX;ASM)
     set(BUILTINS_${target}_CMAKE_${lang}_local_flags "--target=${target} -mthumb")
     if(${target} STREQUAL "armv8m.main-unknown-eabi")
-      set(BUILTINS_${target}_CMAKE_${lang}_local_flags "${BUILTINS_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=hard -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
+      set(BUILTINS_${target}_CMAKE_${lang}_local_flags "${BUILTINS_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
     endif()
     set(BUILTINS_${target}_CMAKE_${lang}_FLAGS "${BUILTINS_${target}_CMAKE_${lang}_local_flags}" CACHE STRING "")
   endforeach()
@@ -327,7 +327,11 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi)
   foreach(lang C;CXX;ASM)
     # TODO: The preprocessor defines workaround various issues in libc and libc++ integration.
     # These should be addressed and removed over time.
-    set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "--target=${target} -mthumb -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1" CACHE STRING "")
+    set(RUNTIMES_${target}_CMAKE_${lang}_local_flags "--target=${target} -mthumb -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dtimeval=struct timeval{int tv_sec; int tv_usec;}\" \"-Dgettimeofday(tv, tz)\" -D_LIBCPP_PRINT=1")
+    if(${target} STREQUAL "armv8m.main-unknown-eabi")
+      set(RUNTIMES_${target}_CMAKE_${lang}_local_flags "${RUNTIMES_${target}_CMAKE_${lang}_local_flags} -mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "")
+    endif()
+    set(RUNTIMES_${target}_CMAKE_${lang}_FLAGS "${RUNTIMES_${target}_CMAKE_${lang}_local_flags}" CACHE STRING "")
   endforeach()
   foreach(type SHARED;MODULE;EXE)
     set(RUNTIMES_${target}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")

>From 8dd574236ccaa0a183278396cfec3068b832651a Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie at gmail.com>
Date: Thu, 25 Jul 2024 21:51:11 +0000
Subject: [PATCH 211/259] Fix aarch64-ptrauth.c to avoid writing to cwd which
 might not be writeable

Some of the tests seem to test beyond the driver (and check a warning
coming from the frontend); they should probably be split into separate
tests.
---
 clang/test/Driver/aarch64-ptrauth.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index c8e3aeef1640a..75190c4380826 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -49,14 +49,14 @@
 // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
 
 //// Only support PAuth ABI for Linux as for now.
-// RUN: not %clang -c --target=aarch64-unknown -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2
-// RUN: not %clang -c --target=aarch64-unknown-pauthtest       %s 2>&1 | FileCheck %s --check-prefix=ERR2
+// RUN: not %clang -o /dev/null -c --target=aarch64-unknown -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2
+// RUN: not %clang -o /dev/null -c --target=aarch64-unknown-pauthtest       %s 2>&1 | FileCheck %s --check-prefix=ERR2
 // ERR2: error: ABI 'pauthtest' is not supported for 'aarch64-unknown-unknown-pauthtest'
 
 //// PAuth ABI is encoded as environment part of the triple, so don't allow to explicitly set other environments.
-// RUN: not %clang -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
+// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
 // ERR3: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu'
-// RUN: %clang -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s
+// RUN: %clang -### -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s
 
 //// The only branch protection option compatible with PAuthABI is BTI.
 // RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \

>From 364ee4ee36942e76861e1e7ec9ff8d46ca76e647 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 15:12:39 -0700
Subject: [PATCH 212/259] [lld-macho,test] Avoid writing to CWD

Don't write to `filelist` in the current working directory, which might
be read-only in some testing environments.
---
 lld/test/MachO/filelist.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/test/MachO/filelist.s b/lld/test/MachO/filelist.s
index 3d4846c4505a9..6a9e54fe874e5 100644
--- a/lld/test/MachO/filelist.s
+++ b/lld/test/MachO/filelist.s
@@ -3,7 +3,7 @@
 ## This test verifies that the paths in -filelist get processed in command-line
 ## order.
 
-# RUN: rm -rf %t; split-file %s %t
+# RUN: rm -rf %t; split-file %s %t && cd %t
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/first.s -o %t/first.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/second.s -o %t/second.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o

>From 4ed543d414b6f39fb5302322eecb8abf33c76f89 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida at apple.com>
Date: Thu, 25 Jul 2024 15:26:41 -0700
Subject: [PATCH 213/259] [InstallAPI] Fix early return for scanning
 sub-directories (#100636)

---
 clang/lib/InstallAPI/DirectoryScanner.cpp     |  3 +-
 .../directory-scanning-subdirectories.test    | 61 +++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/InstallAPI/directory-scanning-subdirectories.test

diff --git a/clang/lib/InstallAPI/DirectoryScanner.cpp b/clang/lib/InstallAPI/DirectoryScanner.cpp
index 8984758e7b446..03a8208c7364e 100644
--- a/clang/lib/InstallAPI/DirectoryScanner.cpp
+++ b/clang/lib/InstallAPI/DirectoryScanner.cpp
@@ -130,7 +130,8 @@ Error DirectoryScanner::scanHeaders(StringRef Path, Library &Lib,
   if (ParentPath.empty())
     ParentPath = Path;
   for (const StringRef Dir : SubDirectories)
-    return scanHeaders(Dir, Lib, Type, BasePath, ParentPath);
+    if (Error Err = scanHeaders(Dir, Lib, Type, BasePath, ParentPath))
+      return Err;
 
   return Error::success();
 }
diff --git a/clang/test/InstallAPI/directory-scanning-subdirectories.test b/clang/test/InstallAPI/directory-scanning-subdirectories.test
new file mode 100644
index 0000000000000..3eac90440fa1e
--- /dev/null
+++ b/clang/test/InstallAPI/directory-scanning-subdirectories.test
@@ -0,0 +1,61 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: mkdir -p %t/DstRoot/
+; RUN: cp -r %S/Inputs/LibFoo/* %t/DstRoot/
+
+; RUN: clang-installapi \
+; RUN: -target arm64-apple-macos12 -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -I%t/DstRoot/usr/include -dynamiclib \
+; RUN: -exclude-public-header %t/DstRoot/usr/include/public.h \
+; RUN: %t/DstRoot -o %t/output.tbd 2>&1 | FileCheck %s --allow-empty \
+; RUN: --implicit-check-not=error --implicit-check-not=warning 
+; RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 
+
+
+;--- DstRoot/usr/include/extra/extra.h
+int extra(void);
+
+;--- DstRoot/usr/include/extra/additional/additional.h
+int additional(void);
+
+;--- DstRoot/usr/include/more/more.h
+int more(void);
+
+;--- DstRoot/usr/include/another/another.h
+int another(void);
+
+;--- expected.tbd
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "text": {
+          "global": [
+            "_foo", "_additional", "_more",
+            "_another", "_extra"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libfoo.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "12",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}

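For context, a minimal sketch of the control-flow bug fixed above; `scanOne` is a hypothetical stand-in for DirectoryScanner::scanHeaders. An unconditional `return` inside the loop visits only the first subdirectory, whereas checking the Error and continuing keeps the early exit on failure but scans every directory.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

// Hypothetical per-directory scanner standing in for scanHeaders.
llvm::Error scanOne(llvm::StringRef Dir);

llvm::Error scanAll(llvm::ArrayRef<llvm::StringRef> Dirs) {
  for (llvm::StringRef Dir : Dirs)
    if (llvm::Error Err = scanOne(Dir)) // was: return scanOne(Dir);
      return Err;                       // bail out only on failure
  return llvm::Error::success();
}
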
>From 7b1bb2bbebc4705e2f30bb2179636a73696c4656 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 15:35:03 -0700
Subject: [PATCH 214/259] [Passes] Switch to xxh3_64bits

FNV is slow and the name StableHashing.h might be misleading. Just use
xxh3_64bits, which has been adopted in many places.
---
 llvm/lib/Passes/StandardInstrumentations.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index fc7b82d522bf0..8f2461f40cb00 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/ADT/Any.h"
-#include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LazyCallGraph.h"
@@ -44,6 +43,7 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -753,28 +753,27 @@ static SmallString<32> getIRFileDisplayName(Any IR) {
   SmallString<32> Result;
   raw_svector_ostream ResultStream(Result);
   const Module *M = unwrapModule(IR);
-  stable_hash NameHash = stable_hash_combine_string(M->getName());
-  unsigned int MaxHashWidth = sizeof(stable_hash) * 8 / 4;
+  uint64_t NameHash = xxh3_64bits(M->getName());
+  unsigned MaxHashWidth = sizeof(uint64_t) * 2;
   write_hex(ResultStream, NameHash, HexPrintStyle::Lower, MaxHashWidth);
   if (unwrapIR<Module>(IR)) {
     ResultStream << "-module";
   } else if (const auto *F = unwrapIR<Function>(IR)) {
     ResultStream << "-function-";
-    stable_hash FunctionNameHash = stable_hash_combine_string(F->getName());
+    auto FunctionNameHash = xxh3_64bits(F->getName());
     write_hex(ResultStream, FunctionNameHash, HexPrintStyle::Lower,
               MaxHashWidth);
   } else if (const auto *C = unwrapIR<LazyCallGraph::SCC>(IR)) {
     ResultStream << "-scc-";
-    stable_hash SCCNameHash = stable_hash_combine_string(C->getName());
+    auto SCCNameHash = xxh3_64bits(C->getName());
     write_hex(ResultStream, SCCNameHash, HexPrintStyle::Lower, MaxHashWidth);
   } else if (const auto *L = unwrapIR<Loop>(IR)) {
     ResultStream << "-loop-";
-    stable_hash LoopNameHash = stable_hash_combine_string(L->getName());
+    auto LoopNameHash = xxh3_64bits(L->getName());
     write_hex(ResultStream, LoopNameHash, HexPrintStyle::Lower, MaxHashWidth);
   } else if (const auto *MF = unwrapIR<MachineFunction>(IR)) {
     ResultStream << "-machine-function-";
-    stable_hash MachineFunctionNameHash =
-        stable_hash_combine_string(MF->getName());
+    auto MachineFunctionNameHash = xxh3_64bits(MF->getName());
     write_hex(ResultStream, MachineFunctionNameHash, HexPrintStyle::Lower,
               MaxHashWidth);
   } else {

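As a self-contained sketch of the new hashing scheme (the names below are illustrative, not the actual helpers in StandardInstrumentations.cpp): xxh3_64bits hashes a StringRef to a uint64_t, which is then printed as 16 lowercase hex digits, matching MaxHashWidth above.

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/xxhash.h"

llvm::SmallString<32> hashedName(llvm::StringRef Name) {
  llvm::SmallString<32> Result;
  llvm::raw_svector_ostream OS(Result);
  uint64_t Hash = llvm::xxh3_64bits(Name);
  // 16 hex digits for a 64-bit value, as in the patch's MaxHashWidth.
  llvm::write_hex(OS, Hash, llvm::HexPrintStyle::Lower, sizeof(uint64_t) * 2);
  return Result;
}
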
>From b27360c346a529affde40150f46b967b616a2144 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl at nuwen.net>
Date: Thu, 25 Jul 2024 15:45:29 -0700
Subject: [PATCH 215/259] [libc++][test] Fix Clang `-Wunused-variable` warnings
 in time.zone.members/to_sys.pass.cpp (#100504)

---
 .../time.zone.members/to_sys.pass.cpp            | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
index 874c3d52e460b..e32b6d523d0ed 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys.pass.cpp
@@ -89,7 +89,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000000000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -103,7 +103,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -117,7 +117,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00.000 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -131,7 +131,7 @@ static void test_nonexistent() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::nonexistent_local_time,
       [&]([[maybe_unused]] const std::chrono::nonexistent_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-03-30 02:30:00 is in a gap between
 1986-03-30 02:00:00 CET and
 1986-03-30 03:00:00 CEST which are both equivalent to
@@ -173,7 +173,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000000000 is ambiguous.  It could be
 1986-09-28 02:30:00.000000000 CEST == 1986-09-28 00:30:00.000000000 UTC or
 1986-09-28 02:30:00.000000000 CET == 1986-09-28 01:30:00.000000000 UTC)";
@@ -186,7 +186,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000000 is ambiguous.  It could be
 1986-09-28 02:30:00.000000 CEST == 1986-09-28 00:30:00.000000 UTC or
 1986-09-28 02:30:00.000000 CET == 1986-09-28 01:30:00.000000 UTC)";
@@ -199,7 +199,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00.000 is ambiguous.  It could be
 1986-09-28 02:30:00.000 CEST == 1986-09-28 00:30:00.000 UTC or
 1986-09-28 02:30:00.000 CET == 1986-09-28 01:30:00.000 UTC)";
@@ -212,7 +212,7 @@ static void test_ambiguous() {
   TEST_VALIDATE_EXCEPTION(
       std::chrono::ambiguous_local_time,
       [&]([[maybe_unused]] const std::chrono::ambiguous_local_time& e) {
-        std::string_view what =
+        [[maybe_unused]] std::string_view what =
             R"(1986-09-28 02:30:00 is ambiguous.  It could be
 1986-09-28 02:30:00 CEST == 1986-09-28 00:30:00 UTC or
 1986-09-28 02:30:00 CET == 1986-09-28 01:30:00 UTC)";

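A plausible minimal illustration of why the [[maybe_unused]] annotation helps here (this is not the actual libc++ test macro): when assert() compiles to nothing, e.g. under NDEBUG, a local used only inside the assertion becomes unused and Clang's -Wunused-variable fires.

#include <cassert>
#include <exception>
#include <string_view>

void check(const std::exception& e) {
  // Without [[maybe_unused]], this warns when the assert below is compiled
  // out, because `what` is then never read.
  [[maybe_unused]] std::string_view what = "expected message";
  assert(e.what() == what);
}
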
>From e788788c42fcbed5077b13f8bb88a81a832ab6eb Mon Sep 17 00:00:00 2001
From: Brad Smith <brad at comstyle.com>
Date: Thu, 25 Jul 2024 18:57:14 -0400
Subject: [PATCH 216/259] [clang][Sema] Add support for OpenBSD's syslog format
 attribute (#97366)

---
 clang/docs/ReleaseNotes.rst     | 2 ++
 clang/include/clang/Sema/Sema.h | 1 +
 clang/lib/Sema/SemaChecking.cpp | 5 +++--
 clang/lib/Sema/SemaDeclAttr.cpp | 4 ++--
 clang/test/Sema/attr-format.c   | 7 +++++++
 5 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9f5ee9c82ab0a..5dddd8f1c5af5 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -111,6 +111,8 @@ Attribute Changes in Clang
 - Clang now disallows more than one ``__attribute__((ownership_returns(class, idx)))`` with
   different class names attached to one function.
 
+- Introduced a new format attribute ``__attribute__((format(syslog, 1, 2)))`` from OpenBSD.
+
 Improvements to Clang's diagnostics
 -----------------------------------
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 7bfdaaae45a93..2ec6367eccea0 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2214,6 +2214,7 @@ class Sema final : public SemaBase {
     FST_FreeBSDKPrintf,
     FST_OSTrace,
     FST_OSLog,
+    FST_Syslog,
     FST_Unknown
   };
   static FormatStringType GetFormatStringType(const FormatAttr *Format);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index bde92e04b6b83..07a34fda494d8 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -6030,7 +6030,7 @@ static const Expr *maybeConstEvalStringLiteral(ASTContext &Context,
 Sema::FormatStringType Sema::GetFormatStringType(const FormatAttr *Format) {
   return llvm::StringSwitch<FormatStringType>(Format->getType()->getName())
       .Case("scanf", FST_Scanf)
-      .Cases("printf", "printf0", FST_Printf)
+      .Cases("printf", "printf0", "syslog", FST_Printf)
       .Cases("NSString", "CFString", FST_NSString)
       .Case("strftime", FST_Strftime)
       .Case("strfmon", FST_Strfmon)
@@ -6124,6 +6124,7 @@ bool Sema::CheckFormatArguments(ArrayRef<const Expr *> Args,
     case FST_Kprintf:
     case FST_FreeBSDKPrintf:
     case FST_Printf:
+    case FST_Syslog:
       Diag(FormatLoc, diag::note_format_security_fixit)
         << FixItHint::CreateInsertion(FormatLoc, "\"%s\", ");
       break;
@@ -7860,7 +7861,7 @@ static void CheckFormatString(
 
   if (Type == Sema::FST_Printf || Type == Sema::FST_NSString ||
       Type == Sema::FST_FreeBSDKPrintf || Type == Sema::FST_OSLog ||
-      Type == Sema::FST_OSTrace) {
+      Type == Sema::FST_OSTrace || Type == Sema::FST_Syslog) {
     CheckPrintfHandler H(
         S, FExpr, OrigFormatExpr, Type, firstDataArg, numDataArgs,
         (Type == Sema::FST_NSString || Type == Sema::FST_OSTrace), Str, APK,
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 39675422e3f9f..300bd8967790f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3407,8 +3407,8 @@ static FormatAttrKind getFormatAttrKind(StringRef Format) {
       // Otherwise, check for supported formats.
       .Cases("scanf", "printf", "printf0", "strfmon", SupportedFormat)
       .Cases("cmn_err", "vcmn_err", "zcmn_err", SupportedFormat)
-      .Case("kprintf", SupportedFormat)         // OpenBSD.
-      .Case("freebsd_kprintf", SupportedFormat) // FreeBSD.
+      .Cases("kprintf", "syslog", SupportedFormat) // OpenBSD.
+      .Case("freebsd_kprintf", SupportedFormat)    // FreeBSD.
       .Case("os_trace", SupportedFormat)
       .Case("os_log", SupportedFormat)
 
diff --git a/clang/test/Sema/attr-format.c b/clang/test/Sema/attr-format.c
index 1f4c864d4f78b..5a8b1ac9eca5c 100644
--- a/clang/test/Sema/attr-format.c
+++ b/clang/test/Sema/attr-format.c
@@ -99,3 +99,10 @@ void forward_fixed(const char *fmt, _Bool b, char i, short j, int k, float l, do
   a(fmt, b, i, j, k, l, m);
 }
 
+// OpenBSD
+// same as format(printf(...))...
+void a2(const char *a, ...) __attribute__((format(syslog, 1, 2)));    // no-error
+void b2(const char *a, ...) __attribute__((format(syslog, 1, 1)));    // expected-error {{'format' attribute parameter 3 is out of bounds}}
+void c2(const char *a, ...) __attribute__((format(syslog, 0, 2)));    // expected-error {{'format' attribute parameter 2 is out of bounds}}
+void d2(const char *a, int c) __attribute__((format(syslog, 1, 2)));  // expected-warning {{GCC requires a function with the 'format' attribute to be variadic}}
+void e2(char *str, int c, ...) __attribute__((format(syslog, 2, 3))); // expected-error {{format argument not a string type}}

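A hedged usage sketch (log_message is a hypothetical wrapper, not an OpenBSD libc function): with the new syslog format kind, clang type-checks the format string and its arguments the same way it does for printf-style functions.

void log_message(int priority, const char *fmt, ...)
    __attribute__((format(syslog, 2, 3)));

void example(const char *user) {
  log_message(1, "login from %s", user);  // OK: %s matches const char *
  // log_message(1, "uid %s", 42);        // would warn: %s given an int
}
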
>From 0d4f9484bcd51d1ed07723c7b762e0f33d9f448f Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Thu, 25 Jul 2024 16:19:05 -0700
Subject: [PATCH 217/259] [lldb/Plugins] Fix build failure on windows following
 2914a4b88837

This patch tries to fix the following build failure on Windows:

https://lab.llvm.org/buildbot/#/builders/141/builds/1083

This started happening after 2914a4b88837 and seems to be caused by the
special `#include` ordering required for the lldb-python header on Windows.
Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
 .../ScriptedPlatformPythonInterface.cpp             |  1 -
 .../ScriptedPlatformPythonInterface.h               |  8 ++++++++
 .../ScriptedProcessPythonInterface.h                |  8 ++++++++
 .../Python/Interfaces/ScriptedPythonInterface.cpp   | 11 ++++++++---
 .../Python/Interfaces/ScriptedPythonInterface.h     |  7 +++----
 .../ScriptedThreadPlanPythonInterface.cpp           | 13 ++++++++-----
 .../Interfaces/ScriptedThreadPythonInterface.cpp    | 11 ++++++++---
 .../Python/ScriptInterpreterPython.cpp              | 11 ++++++++---
 8 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
index 19d2ee2c717d8..2b701f85010ed 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
@@ -12,7 +12,6 @@
 // LLDB Python header must be included first
 #include "../../lldb-python.h"
 //clang-format on
-
 #endif
 
 #include "lldb/Core/PluginManager.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
index 01ee40a5a197c..c4b4e45b69e84 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
@@ -9,6 +9,14 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
index bb27734739f43..996cbd63e0897 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
@@ -9,6 +9,14 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPROCESSPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPROCESSPYTHONINTERFACE_H
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 699412e437a1a..7bb1bfedf2f29 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -6,15 +6,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
-// LLDB Python header must be included first
-#include "../lldb-python.h"
-
 #include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPythonInterface.h"
 #include <optional>
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index 74b89a6f6a883..d92e45e265f68 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -16,6 +16,9 @@
 //clang-format on
 #endif
 
+#include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterface.h"
+#include "lldb/Utility/DataBufferHeap.h"
 
 #include <optional>
 #include <sstream>
@@ -23,10 +26,6 @@
 #include <type_traits>
 #include <utility>
 
-#include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/ScriptedInterface.h"
-#include "lldb/Utility/DataBufferHeap.h"
-
 #if LLDB_ENABLE_PYTHON
 
 #include "../PythonDataObjects.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
index 5f1c7da71bd52..d83ab280d9772 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
@@ -6,17 +6,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "lldb/Core/PluginManager.h"
-#include "lldb/Host/Config.h"
-#include "lldb/Utility/Log.h"
-#include "lldb/lldb-enumerations.h"
-
 #if LLDB_ENABLE_PYTHON
 
 // clang-format off
 // LLDB Python header must be included first
 #include "../../lldb-python.h"
 //clang-format on
+#endif
+
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Host/Config.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/lldb-enumerations.h"
+
+#if LLDB_ENABLE_PYTHON
 
 #include "../../SWIGPythonBridge.h"
 #include "../../ScriptInterpreterPythonImpl.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
index 8af89d761764b..baf15f76c83ed 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
@@ -6,6 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
@@ -13,9 +21,6 @@
 
 #if LLDB_ENABLE_PYTHON
 
-// LLDB Python header must be included first
-#include "../lldb-python.h"
-
 #include "../SWIGPythonBridge.h"
 #include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedThreadPythonInterface.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index b1a69c9207649..d056bbd03b7c5 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -6,14 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if LLDB_ENABLE_PYTHON
+
+// clang-format off
+// LLDB Python header must be included first
+#include "../../lldb-python.h"
+//clang-format on
+#endif
+
 #include "lldb/Host/Config.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
-// LLDB Python header must be included first
-#include "lldb-python.h"
-
 #include "Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h"
 #include "Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h"
 #include "Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h"

>From 329ee76fdde16e212df61cd4c9953b8fb332f652 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Thu, 25 Jul 2024 16:36:43 -0700
Subject: [PATCH 218/259] [MC][NFC] Drop unused
 MCDecodedPseudoProbeInlineTree::ChildrenToProcess (#100576)

The last use was removed in 3f97016857b0305294f3a55ea220884fb50ce033.

This results in a slight peak RSS reduction in
`perf2bolt --profile-use-pseudo-probes` from 17.24 to 16.85 GiB.
---
 llvm/include/llvm/MC/MCPseudoProbe.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 5344dea4141b3..54a7f832e5008 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -280,8 +280,6 @@ class MCDecodedPseudoProbeInlineTree
                                          MCDecodedPseudoProbeInlineTree> {
 public:
   InlineSite ISite;
-  // Used for decoding
-  uint32_t ChildrenToProcess = 0;
 
   MCDecodedPseudoProbeInlineTree() = default;
   MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};

>From 026972af9c3cbd85b654b67a5b5c3b754a78a997 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 16:45:09 -0700
Subject: [PATCH 219/259] [ELF] Remove obsoleted comment after #99567

---
 lld/ELF/ScriptLexer.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp
index 65055086d6bc2..40c4637004352 100644
--- a/lld/ELF/ScriptLexer.cpp
+++ b/lld/ELF/ScriptLexer.cpp
@@ -20,11 +20,6 @@
 // in various corner cases. We do not care much about efficiency because
 // the time spent in parsing linker scripts is usually negligible.
 //
-// Our grammar of the linker script is LL(2), meaning that it needs at
-// most two-token lookahead to parse. The only place we need two-token
-// lookahead is labels in version scripts, where we need to parse "local :"
-// as if "local:".
-//
 // Overall, this lexer works fine for most linker scripts. There might
 // be room for improving compatibility, but that's probably not at the
 // top of our todo list.

>From 7432ad6af551d7b49f54ae2bb0693192ded987c6 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 25 Jul 2024 19:55:37 -0400
Subject: [PATCH 220/259] [LV][VP][NFC]Add tests for safe store/load
 forwarding/dependence distance.

Reviewers: fhahn

Reviewed By: fhahn

Pull Request: https://github.com/llvm/llvm-project/pull/100635
---
 ...e-force-tail-with-evl-safe-dep-distance.ll | 560 ++++++++++++++++++
 1 file changed, 560 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
new file mode 100644
index 0000000000000..2dd47d5c1ea8a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -0,0 +1,560 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; Dependence distance between read and write is greater than the trip
+; count of the loop.  Thus, values written are never read for any
+; valid vectorization of the loop.
+define void @test(ptr %p) {
+; IF-EVL-LABEL: @test(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP7]], 200
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 8 [[TMP12]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 8
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 200
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 8
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 8
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 200
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 8
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 8
+  %offset = add i64 %iv, 200
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 8
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Dependence distance is less than trip count, thus we must prove that
+; chosen VF guaranteed to be less than dependence distance.
+define void @test_may_clobber1(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber1(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 100
+; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 100
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber1(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 100
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; NO-VP-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; NO-VP-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 100
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 100
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_may_clobber2(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber2(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 9
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber2(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 9
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 9
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_may_clobber3(ptr %p) {
+; IF-EVL-LABEL: @test_may_clobber3(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 10
+; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 10
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @test_may_clobber3(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP:       vector.ph:
+; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP:       vector.body:
+; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
+; NO-VP-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 10
+; NO-VP-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; NO-VP-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; NO-VP-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; NO-VP-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; NO-VP-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP:       middle.block:
+; NO-VP-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP:       scalar.ph:
+; NO-VP-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 10
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 10
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Trivially no overlap due to the maximum possible value of VLEN and LMUL
+define void @trivial_due_max_vscale(ptr %p) {
+; IF-EVL-LABEL: @trivial_due_max_vscale(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP9]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP7]], 8192
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP12]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 8192
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @trivial_due_max_vscale(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 8192
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 8192
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 199
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The dependence distance could be violated via LMUL>=2 or interleaving
+define void @no_high_lmul_or_interleave(ptr %p) {
+; IF-EVL-LABEL: @no_high_lmul_or_interleave(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; IF-EVL-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
+; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
+; IF-EVL-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], 1024
+; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; IF-EVL-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
+; IF-EVL-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    br label [[LOOP:%.*]]
+; IF-EVL:       loop:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 1024
+; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL:       exit:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @no_high_lmul_or_interleave(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[LOOP:%.*]]
+; NO-VP:       loop:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; NO-VP-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
+; NO-VP-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 1024
+; NO-VP-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
+; NO-VP-NEXT:    store i64 [[V]], ptr [[A2]], align 32
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
+; NO-VP-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP:       exit:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %a1 = getelementptr i64, ptr %p, i64 %iv
+  %v = load i64, ptr %a1, align 32
+  %offset = add i64 %iv, 1024
+  %a2 = getelementptr i64, ptr %p, i64 %offset
+  store i64 %v, ptr %a2, align 32
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ne i64 %iv, 3001
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @non-power-2-storeloadforward(ptr %A) {
+; IF-EVL-LABEL: @non-power-2-storeloadforward(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IV]], -3
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = add nsw i64 [[IV]], 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; IF-EVL-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; IF-EVL:       for.end:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @non-power-2-storeloadforward(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IV]], -3
+; NO-VP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT:    [[TMP2:%.*]] = add nsw i64 [[IV]], 4
+; NO-VP-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; NO-VP-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; NO-VP-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
+; NO-VP-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
+; NO-VP-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; NO-VP:       for.end:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 16, %entry ], [ %iv.next, %for.body ]
+  %0 = add nsw i64 %iv, -3
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %0
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = add nsw i64 %iv, 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %2
+  %3 = load i32, ptr %arrayidx2, align 4
+  %add3 = add nsw i32 %3, %1
+  %arrayidx5 = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add3, ptr %arrayidx5, align 4
+  %iv.next = add i64 %iv, 1
+  %lftr.wideiv = trunc i64 %iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+

>From 754dc9ff5ada14afef0d7981192d44f249f63e27 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 26 Jul 2024 07:56:12 +0800
Subject: [PATCH 221/259] [RISCV] Move exact VLEN VLMAX transform to
 RISCVVectorPeephole (#100551)

We can teach RISCVVectorPeephole to detect when an AVL is equal to
VLMAX whenever the exact VLEN is known, and to replace it with the
VLMAX sentinel. Doing so removes the need for getVLOp in
RISCVISelLowering and keeps all of the VLMAX logic in one place.
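
For reference, the new check relies on the standard RVV relation
VLMAX = (VLEN / SEW) * LMUL, with LMUL tracked as a fixed-point value in
eighths so that fractional LMULs stay integral. A minimal standalone sketch
of that arithmetic (illustrative only, not part of this patch; the helper
name is made up):

  #include <cassert>
  #include <cstdint>

  // LMULFixed is LMUL scaled by 8: LMUL=1/2 -> 4, LMUL=1 -> 8, LMUL=2 -> 16.
  static bool avlIsVLMAX(uint64_t VLen, uint64_t LMULFixed, uint64_t SEW,
                         uint64_t AVL) {
    // VLMAX = (VLEN * LMUL) / SEW; compare in eighths to avoid fractions.
    return (VLen * LMULFixed) / SEW == AVL * 8;
  }

  int main() {
    // VLEN=128, LMUL=2 (LMULFixed=16), SEW=32 gives VLMAX = 128*2/32 = 8,
    // so an AVL of 8 can be canonicalized to the VLMAX sentinel.
    assert(avlIsVLMAX(128, 16, 32, 8));
    assert(!avlIsVLMAX(128, 16, 32, 4));
    return 0;
  }

Scaling LMUL by 8 keeps the comparison exact for fractional LMULs
(MF2/MF4/MF8) without any floating-point arithmetic.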
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 35 +++----------
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 52 ++++++++++++++-----
 llvm/test/CodeGen/RISCV/rvv/pr83017.ll        |  6 +--
 llvm/test/CodeGen/RISCV/rvv/pr90559.ll        |  6 +--
 4 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d40d4997d7614..0339b302fb218 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2758,19 +2758,6 @@ static SDValue getAllOnesMask(MVT VecVT, SDValue VL, const SDLoc &DL,
   return DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
 }
 
-static SDValue getVLOp(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
-                       SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
-  // If we know the exact VLEN, and our VL is exactly equal to VLMAX,
-  // canonicalize the representation.  InsertVSETVLI will pick the immediate
-  // encoding later if profitable.
-  const auto [MinVLMAX, MaxVLMAX] =
-      RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
-  if (MinVLMAX == MaxVLMAX && NumElts == MinVLMAX)
-    return DAG.getRegister(RISCV::X0, Subtarget.getXLenVT());
-
-  return DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
-}
-
 static std::pair<SDValue, SDValue>
 getDefaultScalableVLOps(MVT VecVT, const SDLoc &DL, SelectionDAG &DAG,
                         const RISCVSubtarget &Subtarget) {
@@ -2784,7 +2771,7 @@ static std::pair<SDValue, SDValue>
 getDefaultVLOps(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
                 SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
   assert(ContainerVT.isScalableVector() && "Expecting scalable container type");
-  SDValue VL = getVLOp(NumElts, ContainerVT, DL, DAG, Subtarget);
+  SDValue VL = DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
   SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG);
   return {Mask, VL};
 }
@@ -9427,8 +9414,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     MVT VT = Op->getSimpleValueType(0);
     MVT ContainerVT = getContainerForFixedLengthVector(VT);
 
-    SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
     SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT);
     auto *Load = cast<MemIntrinsicSDNode>(Op);
     SmallVector<EVT, 9> ContainerVTs(NF, ContainerVT);
@@ -9507,8 +9493,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MVT VT = Op->getOperand(2).getSimpleValueType();
     MVT ContainerVT = getContainerForFixedLengthVector(VT);
 
-    SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
     SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT);
     SDValue Ptr = Op->getOperand(NF + 2);
 
@@ -9974,7 +9959,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     // Set the vector length to only the number of elements we care about. Note
     // that for slideup this includes the offset.
     unsigned EndIndex = OrigIdx + SubVecVT.getVectorNumElements();
-    SDValue VL = getVLOp(EndIndex, ContainerVT, DL, DAG, Subtarget);
+    SDValue VL = DAG.getConstant(EndIndex, DL, XLenVT);
 
     // Use tail agnostic policy if we're inserting over Vec's tail.
     unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
@@ -10211,8 +10196,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
         getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
     // Set the vector length to only the number of elements we care about. This
     // avoids sliding down elements we're going to discard straight away.
-    SDValue VL = getVLOp(SubVecVT.getVectorNumElements(), ContainerVT, DL, DAG,
-                         Subtarget);
+    SDValue VL = DAG.getConstant(SubVecVT.getVectorNumElements(), DL, XLenVT);
     SDValue SlidedownAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
     SDValue Slidedown =
         getVSlidedown(DAG, Subtarget, DL, ContainerVT,
@@ -10287,8 +10271,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   SDValue SlidedownAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
   auto [Mask, VL] = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
   if (SubVecVT.isFixedLengthVector())
-    VL = getVLOp(SubVecVT.getVectorNumElements(), InterSubVT, DL, DAG,
-                 Subtarget);
+    VL = DAG.getConstant(SubVecVT.getVectorNumElements(), DL, XLenVT);
   SDValue Slidedown =
       getVSlidedown(DAG, Subtarget, DL, InterSubVT, DAG.getUNDEF(InterSubVT),
                     Vec, SlidedownAmt, Mask, VL);
@@ -10668,7 +10651,7 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
     return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL);
   }
 
-  SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG, Subtarget);
+  SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
 
   bool IsMaskOp = VT.getVectorElementType() == MVT::i1;
   SDValue IntID = DAG.getTargetConstant(
@@ -10715,7 +10698,6 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
   SDValue NewValue =
       convertToScalableVector(ContainerVT, StoreVal, DAG, Subtarget);
 
-
   // If we know the exact VLEN and our fixed length vector completely fills
   // the container, use a whole register store instead.
   const auto [MinVLMAX, MaxVLMAX] =
@@ -10728,8 +10710,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
                         MMO->getFlags(), MMO->getAAInfo());
   }
 
-  SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
-                       Subtarget);
+  SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
 
   bool IsMaskOp = VT.getVectorElementType() == MVT::i1;
   SDValue IntID = DAG.getTargetConstant(
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index b083e64cfc8d7..f328c55e1d3ba 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -47,6 +47,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   MachineRegisterInfo *MRI;
   const TargetRegisterInfo *TRI;
+  const RISCVSubtarget *ST;
   RISCVVectorPeephole() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -64,6 +65,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
   bool convertVMergeToVMv(MachineInstr &MI) const;
 
   bool isAllOnesMask(const MachineInstr *MaskDef) const;
+  std::optional<unsigned> getConstant(const MachineOperand &VL) const;
 
   /// Maps uses of V0 to the corresponding def of V0.
   DenseMap<const MachineInstr *, const MachineInstr *> V0Defs;
@@ -76,13 +78,44 @@ char RISCVVectorPeephole::ID = 0;
 INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false,
                 false)
 
-// If an AVL is a VLENB that's possibly scaled to be equal to VLMAX, convert it
-// to the VLMAX sentinel value.
+/// Check if an operand is an immediate or a materialized ADDI $x0, imm.
+std::optional<unsigned>
+RISCVVectorPeephole::getConstant(const MachineOperand &VL) const {
+  if (VL.isImm())
+    return VL.getImm();
+
+  MachineInstr *Def = MRI->getVRegDef(VL.getReg());
+  if (!Def || Def->getOpcode() != RISCV::ADDI ||
+      Def->getOperand(1).getReg() != RISCV::X0)
+    return std::nullopt;
+  return Def->getOperand(2).getImm();
+}
+
+/// Convert AVLs that are known to be VLMAX to the VLMAX sentinel.
 bool RISCVVectorPeephole::convertToVLMAX(MachineInstr &MI) const {
   if (!RISCVII::hasVLOp(MI.getDesc().TSFlags) ||
       !RISCVII::hasSEWOp(MI.getDesc().TSFlags))
     return false;
+
+  auto LMUL = RISCVVType::decodeVLMUL(RISCVII::getLMul(MI.getDesc().TSFlags));
+  // Fixed-point value, denominator=8
+  unsigned LMULFixed = LMUL.second ? (8 / LMUL.first) : 8 * LMUL.first;
+  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+  // A Log2SEW of 0 is an operation on mask registers only
+  unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
+  assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+  assert(8 * LMULFixed / SEW > 0);
+
+  // If the exact VLEN is known then we know VLMAX, check if the AVL == VLMAX.
   MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (auto VLen = ST->getRealVLen(), AVL = getConstant(VL);
+      VLen && AVL && (*VLen * LMULFixed) / SEW == *AVL * 8) {
+    VL.ChangeToImmediate(RISCV::VLMaxSentinel);
+    return true;
+  }
+
+  // If an AVL is a VLENB that's possibly scaled to be equal to VLMAX, convert
+  // it to the VLMAX sentinel value.
   if (!VL.isReg())
     return false;
   MachineInstr *Def = MRI->getVRegDef(VL.getReg());
@@ -105,15 +138,6 @@ bool RISCVVectorPeephole::convertToVLMAX(MachineInstr &MI) const {
   if (!Def || Def->getOpcode() != RISCV::PseudoReadVLENB)
     return false;
 
-  auto LMUL = RISCVVType::decodeVLMUL(RISCVII::getLMul(MI.getDesc().TSFlags));
-  // Fixed-point value, denominator=8
-  unsigned LMULFixed = LMUL.second ? (8 / LMUL.first) : 8 * LMUL.first;
-  unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
-  // A Log2SEW of 0 is an operation on mask registers only
-  unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
-  assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
-  assert(8 * LMULFixed / SEW > 0);
-
   // AVL = (VLENB * Scale)
   //
   // VLMAX = (VLENB * 8 * LMUL) / SEW
@@ -302,11 +326,11 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   // Skip if the vector extension is not enabled.
-  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
-  if (!ST.hasVInstructions())
+  ST = &MF.getSubtarget<RISCVSubtarget>();
+  if (!ST->hasVInstructions())
     return false;
 
-  TII = ST.getInstrInfo();
+  TII = ST->getInstrInfo();
   MRI = &MF.getRegInfo();
   TRI = MRI->getTargetRegisterInfo();
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr83017.ll b/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
index 3719a2ad994d6..beca480378a35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr83017.ll
@@ -35,11 +35,11 @@ define void @aliasing(ptr %p) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vs1r.v v8, (a2)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vs4r.v v12, (a0)
 ; CHECK-NEXT:    addi a2, a0, 64
 ; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    sw a1, 84(a0)
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i64 84
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr90559.ll b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
index 8d330b12055ae..7e109f307c4a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
@@ -32,11 +32,11 @@ define void @f(ptr %p) vscale_range(2,2) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vs1r.v v8, (a2)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vs4r.v v12, (a0)
 ; CHECK-NEXT:    addi a2, a0, 64
 ; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    sw a1, 84(a0)
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i64 84

>From e8504cb0c933b75d0e31a627d3bd4c0e37e042e1 Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Thu, 25 Jul 2024 17:11:36 -0700
Subject: [PATCH 222/259] [lldb] Revert scripting template list patches
 (#100673)

Reverts https://github.com/llvm/llvm-project/pull/97273 since it broke
the Windows bot:

https://lab.llvm.org/buildbot/#/builders/141/builds/1025/steps/4/logs/stdio
---
 lldb/include/lldb/Core/PluginManager.h        |  20 ---
 .../Interfaces/ScriptedInterface.h            |   7 -
 .../Interfaces/ScriptedInterfaceUsages.h      |  43 ------
 lldb/include/lldb/lldb-private-interfaces.h   |   3 -
 .../Commands/CommandObjectScripting.cpp       | 126 +-----------------
 lldb/source/Commands/Options.td               |   6 -
 lldb/source/Core/PluginManager.cpp            |  65 ---------
 lldb/source/Interpreter/CMakeLists.txt        |   4 -
 .../Interpreter/Interfaces/CMakeLists.txt     |  10 --
 .../Interfaces/ScriptedInterfaceUsages.cpp    |  37 -----
 .../Python/Interfaces/CMakeLists.txt          |  10 +-
 .../OperatingSystemPythonInterface.cpp        |  25 +---
 .../OperatingSystemPythonInterface.h          |  26 +---
 .../CMakeLists.txt                            |  16 ---
 .../ScriptedPlatformPythonInterface.cpp       |  31 +----
 .../ScriptedPlatformPythonInterface.h         |  25 +---
 .../CMakeLists.txt                            |  16 ---
 .../ScriptedProcessPythonInterface.cpp        |  38 +-----
 .../ScriptedProcessPythonInterface.h          |  26 +---
 .../CMakeLists.txt                            |  16 ---
 .../Interfaces/ScriptedPythonInterface.cpp    |  11 +-
 .../Interfaces/ScriptedPythonInterface.h      |  13 +-
 .../ScriptedThreadPlanPythonInterface.cpp     |  33 +----
 .../ScriptedThreadPlanPythonInterface.h       |  27 +---
 .../CMakeLists.txt                            |  16 ---
 .../ScriptedThreadPythonInterface.cpp         |  11 +-
 .../ScriptedThreadPythonInterface.h           |   7 -
 .../Python/PythonDataObjects.h                |   9 +-
 .../Python/SWIGPythonBridge.h                 |  15 +--
 .../Python/ScriptInterpreterPython.cpp        |  19 +--
 .../Python/ScriptInterpreterPython.h          |   8 --
 .../Python/ScriptInterpreterPythonImpl.h      |  10 +-
 32 files changed, 63 insertions(+), 666 deletions(-)
 delete mode 100644 lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
 delete mode 100644 lldb/source/Interpreter/Interfaces/CMakeLists.txt
 delete mode 100644 lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{OperatingSystemPythonInterface => }/OperatingSystemPythonInterface.cpp (75%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{OperatingSystemPythonInterface => }/OperatingSystemPythonInterface.h (77%)
 delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedPlatformPythonInterface => }/ScriptedPlatformPythonInterface.cpp (82%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedPlatformPythonInterface => }/ScriptedPlatformPythonInterface.h (79%)
 delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedProcessPythonInterface => }/ScriptedProcessPythonInterface.cpp (85%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedProcessPythonInterface => }/ScriptedProcessPythonInterface.h (84%)
 delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedThreadPlanPythonInterface => }/ScriptedThreadPlanPythonInterface.cpp (77%)
 rename lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/{ScriptedThreadPlanPythonInterface => }/ScriptedThreadPlanPythonInterface.h (77%)
 delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index a23f834f471fb..38a291d9f0afd 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -10,7 +10,6 @@
 #define LLDB_CORE_PLUGINMANAGER_H
 
 #include "lldb/Core/Architecture.h"
-#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Symbol/TypeSystem.h"
 #include "lldb/Utility/CompletionRequest.h"
 #include "lldb/Utility/FileSpec.h"
@@ -488,25 +487,6 @@ class PluginManager {
 
   static LanguageSet GetAllTypeSystemSupportedLanguagesForExpressions();
 
-  // Scripted Interface
-  static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
-                             ScriptedInterfaceCreateInstance create_callback,
-                             lldb::ScriptLanguage language,
-                             ScriptedInterfaceUsages usages);
-
-  static bool UnregisterPlugin(ScriptedInterfaceCreateInstance create_callback);
-
-  static uint32_t GetNumScriptedInterfaces();
-
-  static llvm::StringRef GetScriptedInterfaceNameAtIndex(uint32_t idx);
-
-  static llvm::StringRef GetScriptedInterfaceDescriptionAtIndex(uint32_t idx);
-
-  static lldb::ScriptLanguage GetScriptedInterfaceLanguageAtIndex(uint32_t idx);
-
-  static ScriptedInterfaceUsages
-  GetScriptedInterfaceUsagesAtIndex(uint32_t idx);
-
   // REPL
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              REPLCreateInstance create_callback,
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
index 3ce47d0584a8a..69504dbcda5dc 100644
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
@@ -9,8 +9,6 @@
 #ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 #define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
 
-#include "ScriptedInterfaceUsages.h"
-
 #include "lldb/Core/StructuredDataImpl.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
@@ -70,11 +68,6 @@ class ScriptedInterface {
     return true;
   }
 
-  static bool CreateInstance(lldb::ScriptLanguage language,
-                             ScriptedInterfaceUsages usages) {
-    return false;
-  }
-
 protected:
   StructuredData::GenericSP m_object_instance_sp;
 };
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
deleted file mode 100644
index 36c0cfdca546e..0000000000000
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//===-- ScriptedInterfaceUsages.h ---------------------------- -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
-#define LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
-
-#include "lldb/lldb-types.h"
-
-#include "lldb/Utility/Stream.h"
-#include "llvm/ADT/StringRef.h"
-
-namespace lldb_private {
-class ScriptedInterfaceUsages {
-public:
-  ScriptedInterfaceUsages() = default;
-  ScriptedInterfaceUsages(const std::vector<llvm::StringRef> ci_usages,
-                          const std::vector<llvm::StringRef> sbapi_usages)
-      : m_command_interpreter_usages(ci_usages), m_sbapi_usages(sbapi_usages) {}
-
-  const std::vector<llvm::StringRef> &GetCommandInterpreterUsages() const {
-    return m_command_interpreter_usages;
-  }
-
-  const std::vector<llvm::StringRef> &GetSBAPIUsages() const {
-    return m_sbapi_usages;
-  }
-
-  enum class UsageKind { CommandInterpreter, API };
-
-  void Dump(Stream &s, UsageKind kind) const;
-
-private:
-  std::vector<llvm::StringRef> m_command_interpreter_usages;
-  std::vector<llvm::StringRef> m_sbapi_usages;
-};
-} // namespace lldb_private
-
-#endif // LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index 87c5ff8d22fb6..10eaf1e6a5add 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -25,7 +25,6 @@ class Value;
 } // namespace llvm
 
 namespace lldb_private {
-class ScriptedInterfaceUsages;
 typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp,
                                          const ArchSpec &arch);
 typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)(
@@ -125,8 +124,6 @@ typedef lldb::REPLSP (*REPLCreateInstance)(Status &error,
                                            lldb::LanguageType language,
                                            Debugger *debugger, Target *target,
                                            const char *repl_options);
-typedef bool (*ScriptedInterfaceCreateInstance)(lldb::ScriptLanguage language,
-                                                ScriptedInterfaceUsages usages);
 typedef int (*ComparisonFunction)(const void *, const void *);
 typedef void (*DebuggerInitializeCallback)(Debugger &debugger);
 /// Trace
diff --git a/lldb/source/Commands/CommandObjectScripting.cpp b/lldb/source/Commands/CommandObjectScripting.cpp
index 730a190a6e891..fee0565a7c48a 100644
--- a/lldb/source/Commands/CommandObjectScripting.cpp
+++ b/lldb/source/Commands/CommandObjectScripting.cpp
@@ -8,14 +8,12 @@
 
 #include "CommandObjectScripting.h"
 #include "lldb/Core/Debugger.h"
-#include "lldb/Core/PluginManager.h"
 #include "lldb/DataFormatters/DataVisualization.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Host/OptionParser.h"
 #include "lldb/Interpreter/CommandInterpreter.h"
 #include "lldb/Interpreter/CommandOptionArgumentTable.h"
 #include "lldb/Interpreter/CommandReturnObject.h"
-#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
 #include "lldb/Interpreter/OptionArgParser.h"
 #include "lldb/Interpreter/ScriptInterpreter.h"
 #include "lldb/Utility/Args.h"
@@ -129,126 +127,9 @@ class CommandObjectScriptingRun : public CommandObjectRaw {
   CommandOptions m_options;
 };
 
-#define LLDB_OPTIONS_scripting_template_list
-#include "CommandOptions.inc"
-
-class CommandObjectScriptingTemplateList : public CommandObjectParsed {
-public:
-  CommandObjectScriptingTemplateList(CommandInterpreter &interpreter)
-      : CommandObjectParsed(
-            interpreter, "scripting template list",
-            "List all the available scripting extension templates. ",
-            "scripting template list [--language <scripting-language> --]") {}
-
-  ~CommandObjectScriptingTemplateList() override = default;
-
-  Options *GetOptions() override { return &m_options; }
-
-  class CommandOptions : public Options {
-  public:
-    CommandOptions() = default;
-    ~CommandOptions() override = default;
-    Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
-                          ExecutionContext *execution_context) override {
-      Status error;
-      const int short_option = m_getopt_table[option_idx].val;
+#pragma mark CommandObjectMultiwordScripting
 
-      switch (short_option) {
-      case 'l':
-        m_language = (lldb::ScriptLanguage)OptionArgParser::ToOptionEnum(
-            option_arg, GetDefinitions()[option_idx].enum_values,
-            eScriptLanguageNone, error);
-        if (!error.Success())
-          error.SetErrorStringWithFormatv(
-              "unrecognized value for language '{0}'", option_arg);
-        break;
-      default:
-        llvm_unreachable("Unimplemented option");
-      }
-
-      return error;
-    }
-
-    void OptionParsingStarting(ExecutionContext *execution_context) override {
-      m_language = lldb::eScriptLanguageDefault;
-    }
-
-    llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
-      return llvm::ArrayRef(g_scripting_template_list_options);
-    }
-
-    lldb::ScriptLanguage m_language = lldb::eScriptLanguageDefault;
-  };
-
-protected:
-  void DoExecute(Args &command, CommandReturnObject &result) override {
-    Stream &s = result.GetOutputStream();
-    s.Printf("Available scripted extension templates:");
-
-    auto print_field = [&s](llvm::StringRef key, llvm::StringRef value) {
-      if (!value.empty()) {
-        s.IndentMore();
-        s.Indent();
-        s << key << ": " << value << '\n';
-        s.IndentLess();
-      }
-    };
-
-    size_t num_listed_interface = 0;
-    size_t num_templates = PluginManager::GetNumScriptedInterfaces();
-    for (size_t i = 0; i < num_templates; i++) {
-      llvm::StringRef plugin_name =
-          PluginManager::GetScriptedInterfaceNameAtIndex(i);
-      if (plugin_name.empty())
-        break;
-
-      lldb::ScriptLanguage lang =
-          PluginManager::GetScriptedInterfaceLanguageAtIndex(i);
-      if (lang != m_options.m_language)
-        continue;
-
-      if (!num_listed_interface)
-        s.EOL();
-
-      num_listed_interface++;
-
-      llvm::StringRef desc =
-          PluginManager::GetScriptedInterfaceDescriptionAtIndex(i);
-      ScriptedInterfaceUsages usages =
-          PluginManager::GetScriptedInterfaceUsagesAtIndex(i);
-
-      print_field("Name", plugin_name);
-      print_field("Language", ScriptInterpreter::LanguageToString(lang));
-      print_field("Description", desc);
-      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::API);
-      usages.Dump(s, ScriptedInterfaceUsages::UsageKind::CommandInterpreter);
-
-      if (i != num_templates - 1)
-        s.EOL();
-    }
-
-    if (!num_listed_interface)
-      s << " None\n";
-  }
-
-private:
-  CommandOptions m_options;
-};
-
-class CommandObjectMultiwordScriptingTemplate : public CommandObjectMultiword {
-public:
-  CommandObjectMultiwordScriptingTemplate(CommandInterpreter &interpreter)
-      : CommandObjectMultiword(
-            interpreter, "scripting template",
-            "Commands for operating on the scripting templates.",
-            "scripting template [<subcommand-options>]") {
-    LoadSubCommand(
-        "list",
-        CommandObjectSP(new CommandObjectScriptingTemplateList(interpreter)));
-  }
-
-  ~CommandObjectMultiwordScriptingTemplate() override = default;
-};
+// CommandObjectMultiwordScripting
 
 CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
     CommandInterpreter &interpreter)
@@ -258,9 +139,6 @@ CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
           "scripting <subcommand> [<subcommand-options>]") {
   LoadSubCommand("run",
                  CommandObjectSP(new CommandObjectScriptingRun(interpreter)));
-  LoadSubCommand("template",
-                 CommandObjectSP(
-                     new CommandObjectMultiwordScriptingTemplate(interpreter)));
 }
 
 CommandObjectMultiwordScripting::~CommandObjectMultiwordScripting() = default;
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 6e5ed21b22ad8..24e97f3bb97d3 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -841,12 +841,6 @@ let Command = "scripting run" in {
     " language. If none is specific the default scripting language is used.">;
 }
 
-let Command = "scripting template list" in {
-  def scripting_template_list_language : Option<"language", "l">,
-    EnumArg<"ScriptLang">, Desc<"Specify the scripting "
-    " language. If none is specified the default scripting language is used.">;
-}
-
 let Command = "source info" in {
   def source_info_count : Option<"count", "c">, Arg<"Count">,
     Desc<"The number of line entries to display.">;
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 01bee8680b7ba..759ef3a8afe02 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -1505,70 +1505,6 @@ LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForExpressions() {
   return all;
 }
 
-#pragma mark ScriptedInterfaces
-
-struct ScriptedInterfaceInstance
-    : public PluginInstance<ScriptedInterfaceCreateInstance> {
-  ScriptedInterfaceInstance(llvm::StringRef name, llvm::StringRef description,
-                            ScriptedInterfaceCreateInstance create_callback,
-                            lldb::ScriptLanguage language,
-                            ScriptedInterfaceUsages usages)
-      : PluginInstance<ScriptedInterfaceCreateInstance>(name, description,
-                                                        create_callback),
-        language(language), usages(usages) {}
-
-  lldb::ScriptLanguage language;
-  ScriptedInterfaceUsages usages;
-};
-
-typedef PluginInstances<ScriptedInterfaceInstance> ScriptedInterfaceInstances;
-
-static ScriptedInterfaceInstances &GetScriptedInterfaceInstances() {
-  static ScriptedInterfaceInstances g_instances;
-  return g_instances;
-}
-
-bool PluginManager::RegisterPlugin(
-    llvm::StringRef name, llvm::StringRef description,
-    ScriptedInterfaceCreateInstance create_callback,
-    lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) {
-  return GetScriptedInterfaceInstances().RegisterPlugin(
-      name, description, create_callback, language, usages);
-}
-
-bool PluginManager::UnregisterPlugin(
-    ScriptedInterfaceCreateInstance create_callback) {
-  return GetScriptedInterfaceInstances().UnregisterPlugin(create_callback);
-}
-
-uint32_t PluginManager::GetNumScriptedInterfaces() {
-  return GetScriptedInterfaceInstances().GetInstances().size();
-}
-
-llvm::StringRef PluginManager::GetScriptedInterfaceNameAtIndex(uint32_t index) {
-  return GetScriptedInterfaceInstances().GetNameAtIndex(index);
-}
-
-llvm::StringRef
-PluginManager::GetScriptedInterfaceDescriptionAtIndex(uint32_t index) {
-  return GetScriptedInterfaceInstances().GetDescriptionAtIndex(index);
-}
-
-lldb::ScriptLanguage
-PluginManager::GetScriptedInterfaceLanguageAtIndex(uint32_t idx) {
-  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
-  return idx < instances.size() ? instances[idx].language
-                                : ScriptLanguage::eScriptLanguageNone;
-}
-
-ScriptedInterfaceUsages
-PluginManager::GetScriptedInterfaceUsagesAtIndex(uint32_t idx) {
-  const auto &instances = GetScriptedInterfaceInstances().GetInstances();
-  if (idx >= instances.size())
-    return {};
-  return instances[idx].usages;
-}
-
 #pragma mark REPL
 
 struct REPLInstance : public PluginInstance<REPLCreateInstance> {
@@ -1629,7 +1565,6 @@ void PluginManager::DebuggerInitialize(Debugger &debugger) {
   GetOperatingSystemInstances().PerformDebuggerCallback(debugger);
   GetStructuredDataPluginInstances().PerformDebuggerCallback(debugger);
   GetTracePluginInstances().PerformDebuggerCallback(debugger);
-  GetScriptedInterfaceInstances().PerformDebuggerCallback(debugger);
 }
 
 // This is the preferred new way to register plugin specific settings.  e.g.
diff --git a/lldb/source/Interpreter/CMakeLists.txt b/lldb/source/Interpreter/CMakeLists.txt
index 642263a8bda7f..ae79b82d7c3e2 100644
--- a/lldb/source/Interpreter/CMakeLists.txt
+++ b/lldb/source/Interpreter/CMakeLists.txt
@@ -6,8 +6,6 @@ lldb_tablegen(InterpreterPropertiesEnum.inc -gen-lldb-property-enum-defs
   SOURCE InterpreterProperties.td
   TARGET LLDBInterpreterPropertiesEnumGen)
 
-add_subdirectory(Interfaces)
-
 add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   CommandAlias.cpp
   CommandHistory.cpp
@@ -56,7 +54,6 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   ScriptInterpreter.cpp
 
   LINK_LIBS
-    lldbInterpreterInterfaces
     lldbCommands
     lldbCore
     lldbDataFormatters
@@ -69,7 +66,6 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
   )
 
 add_dependencies(lldbInterpreter
-  lldbInterpreterInterfaces
   LLDBInterpreterPropertiesGen
   LLDBInterpreterPropertiesEnumGen)
 
diff --git a/lldb/source/Interpreter/Interfaces/CMakeLists.txt b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
deleted file mode 100644
index f44672aa50b75..0000000000000
--- a/lldb/source/Interpreter/Interfaces/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-add_lldb_library(lldbInterpreterInterfaces NO_PLUGIN_DEPENDENCIES
-  ScriptedInterfaceUsages.cpp
-
-  LINK_LIBS
-    lldbUtility
-
-  LINK_COMPONENTS
-    Support
-  )
-
diff --git a/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
deleted file mode 100644
index 05d7a5d852f8c..0000000000000
--- a/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- ScriptedInterfaceUsages.cpp --------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
-
-using namespace lldb;
-using namespace lldb_private;
-
-void ScriptedInterfaceUsages::Dump(Stream &s, UsageKind kind) const {
-  s.IndentMore();
-  s.Indent();
-  llvm::StringRef usage_kind =
-      (kind == UsageKind::CommandInterpreter) ? "Command Interpreter" : "API";
-  s << usage_kind << " Usages:";
-  const std::vector<llvm::StringRef> &usages =
-      (kind == UsageKind::CommandInterpreter) ? GetCommandInterpreterUsages()
-                                              : GetSBAPIUsages();
-  if (usages.empty())
-    s << " None\n";
-  else if (usages.size() == 1)
-    s << " " << usages.front() << '\n';
-  else {
-    s << '\n';
-    for (llvm::StringRef usage : usages) {
-      s.IndentMore();
-      s.Indent();
-      s << usage << '\n';
-      s.IndentLess();
-    }
-  }
-  s.IndentLess();
-}
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
index eb22a960b5345..c60e4bb503a37 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
@@ -20,8 +20,12 @@ if (LLDB_ENABLE_LIBEDIT)
 endif()
 
 add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
+  OperatingSystemPythonInterface.cpp
   ScriptedPythonInterface.cpp
+  ScriptedProcessPythonInterface.cpp
   ScriptedThreadPythonInterface.cpp
+  ScriptedThreadPlanPythonInterface.cpp
+  ScriptedPlatformPythonInterface.cpp
 
   LINK_LIBS
     lldbCore
@@ -34,9 +38,3 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
   LINK_COMPONENTS
     Support
   )
-
-add_subdirectory(OperatingSystemPythonInterface)
-add_subdirectory(ScriptedPlatformPythonInterface)
-add_subdirectory(ScriptedProcessPythonInterface)
-add_subdirectory(ScriptedThreadPlanPythonInterface)
-
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
similarity index 75%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
index 019db269a905b..c162c7367c654 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
@@ -14,13 +13,11 @@
 
 #if LLDB_ENABLE_PYTHON
 
-// clang-format off
 // LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
+#include "../lldb-python.h"
 
-#include "../../SWIGPythonBridge.h"
-#include "../../ScriptInterpreterPythonImpl.h"
+#include "../SWIGPythonBridge.h"
+#include "../ScriptInterpreterPythonImpl.h"
 #include "OperatingSystemPythonInterface.h"
 
 using namespace lldb;
@@ -28,8 +25,6 @@ using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
-LLDB_PLUGIN_DEFINE_ADV(OperatingSystemPythonInterface, ScriptInterpreterPythonOperatingSystemPythonInterface)
-
 OperatingSystemPythonInterface::OperatingSystemPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : OperatingSystemInterface(), ScriptedThreadPythonInterface(interpreter) {}
@@ -84,18 +79,4 @@ OperatingSystemPythonInterface::GetRegisterContextForTID(lldb::tid_t tid) {
   return obj->GetAsString()->GetValue().str();
 }
 
-void OperatingSystemPythonInterface::Initialize() {
-  const std::vector<llvm::StringRef> ci_usages = {
-      "settings set target.process.python-os-plugin-path <script-path>",
-      "settings set process.experimental.os-plugin-reports-all-threads [0/1]"};
-  const std::vector<llvm::StringRef> api_usages = {};
-  PluginManager::RegisterPlugin(
-      GetPluginNameStatic(), llvm::StringRef("Mock thread state"),
-      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
-}
-
-void OperatingSystemPythonInterface::Terminate() {
-  PluginManager::UnregisterPlugin(CreateInstance);
-}
-
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
similarity index 77%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
index 001d49decb5b6..da7bbf13b1d55 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
@@ -9,28 +9,18 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../ScriptedThreadPythonInterface.h"
-
+#include "ScriptedThreadPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
 #include <optional>
 
 namespace lldb_private {
 class OperatingSystemPythonInterface
     : virtual public OperatingSystemInterface,
-      virtual public ScriptedThreadPythonInterface,
-      public PluginInterface {
+      virtual public ScriptedThreadPythonInterface {
 public:
   OperatingSystemPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -51,16 +41,6 @@ class OperatingSystemPythonInterface
   StructuredData::DictionarySP GetRegisterInfo() override;
 
   std::optional<std::string> GetRegisterContextForTID(lldb::tid_t tid) override;
-
-  static void Initialize();
-
-  static void Terminate();
-
-  static llvm::StringRef GetPluginNameStatic() {
-    return "OperatingSystemPythonInterface";
-  }
-
-  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
deleted file mode 100644
index b48f1e818e5d5..0000000000000
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-add_lldb_library(lldbPluginScriptInterpreterPythonOperatingSystemPythonInterface PLUGIN
-
-  OperatingSystemPythonInterface.cpp
-
-  LINK_LIBS
-    lldbCore
-    lldbHost
-    lldbInterpreter
-    lldbTarget
-    lldbPluginScriptInterpreterPython
-    ${Python3_LIBRARIES}
-    ${LLDB_LIBEDIT_LIBS}
-
-  LINK_COMPONENTS
-    Support
-  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
similarity index 82%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
index 2b701f85010ed..6e93bec80056e 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
@@ -6,34 +6,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
-#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
-#include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../../SWIGPythonBridge.h"
-#include "../../ScriptInterpreterPythonImpl.h"
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+
+#include "../SWIGPythonBridge.h"
+#include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPlatformPythonInterface.h"
 
+#include "lldb/Target/ExecutionContext.h"
+
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
-LLDB_PLUGIN_DEFINE_ADV(ScriptedPlatformPythonInterface, ScriptInterpreterPythonScriptedPlatformPythonInterface)
-
 ScriptedPlatformPythonInterface::ScriptedPlatformPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedPlatformInterface(), ScriptedPythonInterface(interpreter) {}
@@ -100,14 +93,4 @@ Status ScriptedPlatformPythonInterface::KillProcess(lldb::pid_t pid) {
   return GetStatusFromMethod("kill_process", pid);
 }
 
-void ScriptedPlatformPythonInterface::Initialize() {
-  PluginManager::RegisterPlugin(
-      GetPluginNameStatic(), "Mock platform and interact with its processes.",
-      CreateInstance, eScriptLanguagePython, {});
-}
-
-void ScriptedPlatformPythonInterface::Terminate() {
-  PluginManager::UnregisterPlugin(CreateInstance);
-}
-
 #endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
similarity index 79%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
index c4b4e45b69e84..0842d3a003429 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
@@ -9,25 +9,16 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../ScriptedPythonInterface.h"
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
 
 namespace lldb_private {
 class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
-                                        public ScriptedPythonInterface,
-                                        public PluginInterface {
+                                        public ScriptedPythonInterface {
 public:
   ScriptedPlatformPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -52,16 +43,6 @@ class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
   Status LaunchProcess(lldb::ProcessLaunchInfoSP launch_info) override;
 
   Status KillProcess(lldb::pid_t pid) override;
-
-  static void Initialize();
-
-  static void Terminate();
-
-  static llvm::StringRef GetPluginNameStatic() {
-    return "ScriptedPlatformPythonInterface";
-  }
-
-  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
deleted file mode 100644
index ae5e525229c02..0000000000000
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-add_lldb_library(lldbPluginScriptInterpreterPythonScriptedPlatformPythonInterface PLUGIN
-
-  ScriptedPlatformPythonInterface.cpp
-
-  LINK_LIBS
-    lldbCore
-    lldbHost
-    lldbInterpreter
-    lldbTarget
-    lldbPluginScriptInterpreterPython
-    ${Python3_LIBRARIES}
-    ${LLDB_LIBEDIT_LIBS}
-
-  LINK_COMPONENTS
-    Support
-  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp
similarity index 85%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp
index 794c3e87ba97f..313c597ce48f3 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp
@@ -6,16 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Host/Config.h"
 #if LLDB_ENABLE_PYTHON
-
-// clang-format off
 // LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
+#include "../lldb-python.h"
 #endif
-
-#include "lldb/Core/PluginManager.h"
-#include "lldb/Host/Config.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
@@ -23,11 +18,10 @@
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../../SWIGPythonBridge.h"
-#include "../../ScriptInterpreterPythonImpl.h"
-#include "../ScriptedThreadPythonInterface.h"
+#include "../SWIGPythonBridge.h"
+#include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedProcessPythonInterface.h"
-
+#include "ScriptedThreadPythonInterface.h"
 #include <optional>
 
 using namespace lldb;
@@ -35,8 +29,6 @@ using namespace lldb_private;
 using namespace lldb_private::python;
 using Locker = ScriptInterpreterPythonImpl::Locker;
 
-LLDB_PLUGIN_DEFINE_ADV(ScriptedProcessPythonInterface, ScriptInterpreterPythonScriptedProcessPythonInterface)
-
 ScriptedProcessPythonInterface::ScriptedProcessPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedProcessInterface(), ScriptedPythonInterface(interpreter) {}
@@ -216,24 +208,4 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetMetadata() {
   return dict;
 }
 
-void ScriptedProcessPythonInterface::Initialize() {
-  const std::vector<llvm::StringRef> ci_usages = {
-      "process attach -C <script-name> [-k key -v value ...]",
-      "process launch -C <script-name> [-k key -v value ...]"};
-  const std::vector<llvm::StringRef> api_usages = {
-      "SBAttachInfo.SetScriptedProcessClassName",
-      "SBAttachInfo.SetScriptedProcessDictionary",
-      "SBTarget.Attach",
-      "SBLaunchInfo.SetScriptedProcessClassName",
-      "SBLaunchInfo.SetScriptedProcessDictionary",
-      "SBTarget.Launch"};
-  PluginManager::RegisterPlugin(
-      GetPluginNameStatic(), llvm::StringRef("Mock process state"),
-      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
-}
-
-void ScriptedProcessPythonInterface::Terminate() {
-  PluginManager::UnregisterPlugin(CreateInstance);
-}
-
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h
similarity index 84%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h
index 996cbd63e0897..c75caa9340f25 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.h
@@ -9,27 +9,17 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPROCESSPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPROCESSPYTHONINTERFACE_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../ScriptedPythonInterface.h"
-
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h"
 #include <optional>
 
 namespace lldb_private {
 class ScriptedProcessPythonInterface : public ScriptedProcessInterface,
-                                       public ScriptedPythonInterface,
-                                       public PluginInterface {
+                                       public ScriptedPythonInterface {
 public:
   ScriptedProcessPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -77,16 +67,6 @@ class ScriptedProcessPythonInterface : public ScriptedProcessInterface,
 
   StructuredData::DictionarySP GetMetadata() override;
 
-  static void Initialize();
-
-  static void Terminate();
-
-  static llvm::StringRef GetPluginNameStatic() {
-    return "ScriptedProcessPythonInterface";
-  }
-
-  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
-
 private:
   lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override;
 };
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
deleted file mode 100644
index 66ed041853f67..0000000000000
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-add_lldb_library(lldbPluginScriptInterpreterPythonScriptedProcessPythonInterface PLUGIN
-
-  ScriptedProcessPythonInterface.cpp
-
-  LINK_LIBS
-    lldbCore
-    lldbHost
-    lldbInterpreter
-    lldbTarget
-    lldbPluginScriptInterpreterPython
-    ${Python3_LIBRARIES}
-    ${LLDB_LIBEDIT_LIBS}
-
-  LINK_COMPONENTS
-    Support
-  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 7bb1bfedf2f29..699412e437a1a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -6,20 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+
 #include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedPythonInterface.h"
 #include <optional>
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index d92e45e265f68..e1a3156d10afd 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -10,15 +10,6 @@
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPYTHONINTERFACE_H
 
 #if LLDB_ENABLE_PYTHON
-// clang-format off
-// LLDB Python header must be included first
-#include "../lldb-python.h"
-//clang-format on
-#endif
-
-#include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/ScriptedInterface.h"
-#include "lldb/Utility/DataBufferHeap.h"
 
 #include <optional>
 #include <sstream>
@@ -26,7 +17,9 @@
 #include <type_traits>
 #include <utility>
 
-#if LLDB_ENABLE_PYTHON
+#include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterface.h"
+#include "lldb/Utility/DataBufferHeap.h"
 
 #include "../PythonDataObjects.h"
 #include "../SWIGPythonBridge.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
similarity index 77%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
index d83ab280d9772..f23858c01277c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
@@ -6,31 +6,23 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
-#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../../SWIGPythonBridge.h"
-#include "../../ScriptInterpreterPythonImpl.h"
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+
+#include "../SWIGPythonBridge.h"
+#include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedThreadPlanPythonInterface.h"
 
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::python;
 
-LLDB_PLUGIN_DEFINE_ADV(ScriptedThreadPlanPythonInterface, ScriptInterpreterPythonScriptedThreadPlanPythonInterface)
-
 ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {}
@@ -110,19 +102,4 @@ ScriptedThreadPlanPythonInterface::GetStopDescription(lldb::StreamSP &stream) {
   return llvm::Error::success();
 }
 
-void ScriptedThreadPlanPythonInterface::Initialize() {
-  const std::vector<llvm::StringRef> ci_usages = {
-      "thread step-scripted -C <script-name> [-k key -v value ...]"};
-  const std::vector<llvm::StringRef> api_usages = {
-      "SBThread.StepUsingScriptedThreadPlan"};
-  PluginManager::RegisterPlugin(
-      GetPluginNameStatic(),
-      llvm::StringRef("Alter thread stepping logic and stop reason"),
-      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
-}
-
-void ScriptedThreadPlanPythonInterface::Terminate() {
-  PluginManager::UnregisterPlugin(CreateInstance);
-}
-
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
similarity index 77%
rename from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
rename to lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
index 73b17a556464f..6ec89b9f59253 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
@@ -9,28 +9,17 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-
-#endif
-
 #include "lldb/Host/Config.h"
-#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "../ScriptedPythonInterface.h"
-
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
 #include <optional>
 
 namespace lldb_private {
 class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
-                                          public ScriptedPythonInterface,
-                                          public PluginInterface {
+                                          public ScriptedPythonInterface {
 public:
   ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter);
 
@@ -52,16 +41,6 @@ class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
   lldb::StateType GetRunState() override;
 
   llvm::Error GetStopDescription(lldb::StreamSP &stream) override;
-
-  static void Initialize();
-
-  static void Terminate();
-
-  static llvm::StringRef GetPluginNameStatic() {
-    return "ScriptedThreadPlanPythonInterface";
-  }
-
-  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
deleted file mode 100644
index db41da165d275..0000000000000
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-add_lldb_library(lldbPluginScriptInterpreterPythonScriptedThreadPlanPythonInterface PLUGIN
-
-  ScriptedThreadPlanPythonInterface.cpp
-
-  LINK_LIBS
-    lldbCore
-    lldbHost
-    lldbInterpreter
-    lldbTarget
-    lldbPluginScriptInterpreterPython
-    ${Python3_LIBRARIES}
-    ${LLDB_LIBEDIT_LIBS}
-
-  LINK_COMPONENTS
-    Support
-  )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
index baf15f76c83ed..8af89d761764b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp
@@ -6,14 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/Log.h"
@@ -21,6 +13,9 @@
 
 #if LLDB_ENABLE_PYTHON
 
+// LLDB Python header must be included first
+#include "../lldb-python.h"
+
 #include "../SWIGPythonBridge.h"
 #include "../ScriptInterpreterPythonImpl.h"
 #include "ScriptedThreadPythonInterface.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
index 7724f9128c514..5676f7f1d6752 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.h
@@ -9,13 +9,6 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPYTHONINTERFACE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPYTHONINTERFACE_H
 
-#if LLDB_ENABLE_PYTHON
-// clang-format off
-// LLDB Python header must be included first
-#include "../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 
 #if LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
index f5fb605c2bf3b..88c1bb7e729e7 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h
@@ -48,17 +48,12 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_PYTHONDATAOBJECTS_H
 
+#include "lldb/Host/Config.h"
+
 #if LLDB_ENABLE_PYTHON
 
-// clang-format off
 // LLDB Python header must be included first
 #include "lldb-python.h"
-//clang-format on
-#endif
-
-#include "lldb/Host/Config.h"
-
-#if LLDB_ENABLE_PYTHON
 
 #include "lldb/Host/File.h"
 #include "lldb/Utility/StructuredData.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index efe173aeea8ea..3026b6113ae8f 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -9,21 +9,16 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SWIGPYTHONBRIDGE_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SWIGPYTHONBRIDGE_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "lldb-python.h"
-//clang-format on
-#endif
-
-#include "lldb/Host/Config.h"
-
 #include <optional>
 #include <string>
 
+#include "lldb/Host/Config.h"
+
 #if LLDB_ENABLE_PYTHON
 
+// LLDB Python header must be included first
+#include "lldb-python.h"
+
 #include "Plugins/ScriptInterpreter/Python/PythonDataObjects.h"
 #include "lldb/lldb-forward.h"
 #include "lldb/lldb-types.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index d056bbd03b7c5..70fa6d83e306f 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -6,23 +6,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "../../lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 #include "lldb/lldb-enumerations.h"
 
 #if LLDB_ENABLE_PYTHON
 
-#include "Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h"
-#include "Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h"
-#include "Interfaces/ScriptedProcessPythonInterface/ScriptedProcessPythonInterface.h"
-#include "Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h"
+// LLDB Python header must be included first
+#include "lldb-python.h"
+
+#include "Interfaces/OperatingSystemPythonInterface.h"
+#include "Interfaces/ScriptedPlatformPythonInterface.h"
+#include "Interfaces/ScriptedProcessPythonInterface.h"
+#include "Interfaces/ScriptedThreadPlanPythonInterface.h"
 #include "Interfaces/ScriptedThreadPythonInterface.h"
 #include "PythonDataObjects.h"
 #include "PythonReadline.h"
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
index c54530e2a5d8f..2e8301a85eb6c 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h
@@ -9,14 +9,6 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHON_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHON_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 
 #if LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
index 99d5b7ae5553b..c2024efb395d7 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
@@ -9,18 +9,12 @@
 #ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H
 #define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_SCRIPTINTERPRETERPYTHONIMPL_H
 
-#if LLDB_ENABLE_PYTHON
-
-// clang-format off
-// LLDB Python header must be included first
-#include "lldb-python.h"
-//clang-format on
-#endif
-
 #include "lldb/Host/Config.h"
 
 #if LLDB_ENABLE_PYTHON
 
+#include "lldb-python.h"
+
 #include "PythonDataObjects.h"
 #include "ScriptInterpreterPython.h"
 

>From 8644a2aa0f3540c69464f56b3d538e888b6cbdcb Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 25 Jul 2024 17:11:52 -0700
Subject: [PATCH 223/259] [ELF,test] Improve negative linker script tests

---
 lld/test/ELF/linkerscript/diag.test           | 49 +++++++++++++++++++
 lld/test/ELF/linkerscript/diag1.test          | 15 ------
 lld/test/ELF/linkerscript/diag2.test          | 13 -----
 lld/test/ELF/linkerscript/diag3.test          | 13 -----
 lld/test/ELF/linkerscript/diag4.test          | 14 ------
 lld/test/ELF/linkerscript/diag5.test          | 14 ------
 lld/test/ELF/linkerscript/diag6.test          |  7 ---
 .../invalid.test}                             |  0
 lld/test/ELF/linkerscript/unquoted.test       | 26 ++++++++++
 9 files changed, 75 insertions(+), 76 deletions(-)
 create mode 100644 lld/test/ELF/linkerscript/diag.test
 delete mode 100644 lld/test/ELF/linkerscript/diag1.test
 delete mode 100644 lld/test/ELF/linkerscript/diag2.test
 delete mode 100644 lld/test/ELF/linkerscript/diag3.test
 delete mode 100644 lld/test/ELF/linkerscript/diag4.test
 delete mode 100644 lld/test/ELF/linkerscript/diag5.test
 delete mode 100644 lld/test/ELF/linkerscript/diag6.test
 rename lld/test/ELF/{invalid-linkerscript.test => linkerscript/invalid.test} (100%)
 create mode 100644 lld/test/ELF/linkerscript/unquoted.test

diff --git a/lld/test/ELF/linkerscript/diag.test b/lld/test/ELF/linkerscript/diag.test
new file mode 100644
index 0000000000000..fbc24659a5311
--- /dev/null
+++ b/lld/test/ELF/linkerscript/diag.test
@@ -0,0 +1,49 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o 0.o
+
+#--- 1.lds
+SECTIONS {
+  .text + { *(.text) }
+  .keep : { *(.keep) } /*
+  comment line 1
+  comment line 2 */
+  .temp : { *(.temp) }
+}
+
+# RUN: not ld.lld -shared 0.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+#      CHECK1:{{.*}}:2: malformed number: +
+# CHECK1-NEXT:>>>   .text + { *(.text) }
+# CHECK1-NEXT:>>>         ^
+
+#--- 2.lds
+
+UNKNOWN_TAG {
+  .text : { *(.text) }
+  .keep : { *(.keep) }
+  .temp : { *(.temp) }
+}
+
+# RUN: not ld.lld -shared 0.o -T 2.lds 2>&1 | FileCheck %s --check-prefix=CHECK2 --match-full-lines --strict-whitespace
+#      CHECK2:{{.*}}:2: unknown directive: UNKNOWN_TAG
+# CHECK2-NEXT:>>> UNKNOWN_TAG {
+# CHECK2-NEXT:>>> ^
+
+#--- 3.lds
+SECTIONS {
+  .text : { *(.text) }
+  .keep : { *(.keep) }
+  boom ^temp : { *(.temp) }
+}
+#--- 3a.lds
+INCLUDE "3.lds"
+#--- 3b.lds
+foo = 3;
+INCLUDE "3a.lds"
+
+# RUN: not ld.lld -shared 0.o -T 3.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 3a.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 3b.lds 2>&1 | FileCheck %s --check-prefix=CHECK3 --match-full-lines --strict-whitespace
+#      CHECK3:{{.*}}3.lds:4: malformed number: ^
+# CHECK3-NEXT:>>>   boom ^temp : { *(.temp) }
+# CHECK3-NEXT:>>>        ^
diff --git a/lld/test/ELF/linkerscript/diag1.test b/lld/test/ELF/linkerscript/diag1.test
deleted file mode 100644
index 829bc5a1bffaf..0000000000000
--- a/lld/test/ELF/linkerscript/diag1.test
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text + { *(.text) }
-  .keep : { *(.keep) } /*
-  comment line 1
-  comment line 2 */
-  .temp : { *(.temp) }
-}
-
-CHECK:      6: malformed number: +
-CHECK-NEXT: >>>   .text + { *(.text) }
-CHECK-NEXT: >>>         ^
diff --git a/lld/test/ELF/linkerscript/diag2.test b/lld/test/ELF/linkerscript/diag2.test
deleted file mode 100644
index aeb623dbb7f4b..0000000000000
--- a/lld/test/ELF/linkerscript/diag2.test
+++ /dev/null
@@ -1,13 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-UNKNOWN_TAG {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  .temp : { *(.temp) }
-}
-
-CHECK:      5: unknown directive: UNKNOWN_TAG
-CHECK-NEXT: >>> UNKNOWN_TAG {
-CHECK-NEXT: >>> ^
diff --git a/lld/test/ELF/linkerscript/diag3.test b/lld/test/ELF/linkerscript/diag3.test
deleted file mode 100644
index 1df8d601db016..0000000000000
--- a/lld/test/ELF/linkerscript/diag3.test
+++ /dev/null
@@ -1,13 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      8: malformed number: ^
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag4.test b/lld/test/ELF/linkerscript/diag4.test
deleted file mode 100644
index d93a69a95c61d..0000000000000
--- a/lld/test/ELF/linkerscript/diag4.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: echo "INCLUDE \"%s\"" > %t.script
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %t.script 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      9: malformed number: ^{{$}}
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag5.test b/lld/test/ELF/linkerscript/diag5.test
deleted file mode 100644
index 9a2304baa4413..0000000000000
--- a/lld/test/ELF/linkerscript/diag5.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: echo "INCLUDE \"%s\"" > %t.script
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %t.script 2>&1 | FileCheck -strict-whitespace %s
-
-SECTIONS {
-  .text : { *(.text) }
-  .keep : { *(.keep) }
-  boom ^temp : { *(.temp) }
-}
-
-# CHECK:      9: malformed number: ^
-# CHECK-NEXT: >>>   boom ^temp : { *(.temp) }
-# CHECK-NEXT: >>>        ^
diff --git a/lld/test/ELF/linkerscript/diag6.test b/lld/test/ELF/linkerscript/diag6.test
deleted file mode 100644
index 0ec0400040b54..0000000000000
--- a/lld/test/ELF/linkerscript/diag6.test
+++ /dev/null
@@ -1,7 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t.o
-# RUN: not ld.lld -shared %t.o -o /dev/null --script %s 2>&1 | FileCheck %s
-
-SECTIONS /*
-
-CHECK: error: {{.*}}diag6.test:1: unclosed comment in a linker script
diff --git a/lld/test/ELF/invalid-linkerscript.test b/lld/test/ELF/linkerscript/invalid.test
similarity index 100%
rename from lld/test/ELF/invalid-linkerscript.test
rename to lld/test/ELF/linkerscript/invalid.test
diff --git a/lld/test/ELF/linkerscript/unquoted.test b/lld/test/ELF/linkerscript/unquoted.test
new file mode 100644
index 0000000000000..7dca75fe09ab1
--- /dev/null
+++ b/lld/test/ELF/linkerscript/unquoted.test
@@ -0,0 +1,26 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o 0.o
+
+#--- empty.lds
+#--- 1.lds
+
+SECTIONS /*
+#--- 1a.lds
+foo = 3;
+INCLUDE "empty.lds"
+INCLUDE "1.lds"
+
+# RUN: not ld.lld -shared 0.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+# RUN: not ld.lld -shared 0.o -T 1a.lds 2>&1 | FileCheck %s --check-prefix=CHECK1A --match-full-lines --strict-whitespace
+#      CHECK1:{{.*}}error: 1.lds:1: unclosed comment in a linker script
+#     CHECK1A:{{.*}}error: 1a.lds:3: unclosed comment in a linker script
+#CHECK1A-NEXT:>>> INCLUDE "1.lds"
+#CHECK1A-NEXT:>>>         ^
+
+#--- 2.lds
+INCLUDE "empty.lds"
+"
+# RUN: not ld.lld -shared 0.o -T 2.lds 2>&1 | FileCheck %s --check-prefix=CHECK2 --match-full-lines --strict-whitespace
+#      CHECK2:{{.*}}error: 2.lds:2: unclosed quote
+# CHECK2-NOT:{{.}}

>From 25f9415713f9f57760a5322876906dc11385ef8e Mon Sep 17 00:00:00 2001
From: Alan Zhao <ayzhao at google.com>
Date: Thu, 25 Jul 2024 17:38:44 -0700
Subject: [PATCH 224/259] [compiler-rt][ubsan][nfc-ish] Fix a type conversion
 bug (#100665)

If the inline asm version of `ptrauth_strip` is used instead of the
builtin, the inline asm implementation currently returns an unsigned
long, causing an incompatible pointer conversion issue. The spec for
`ptrauth_strip` is that the result has the same type as the original
value, so the inline asm version now declares its result with
`__typeof(__value)` instead of `unsigned long`.
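
To make the type issue concrete, here is a minimal, self-contained sketch
(not the actual sanitizer_ptrauth.h macro, which strips the
pointer-authentication bits with AArch64 inline asm; a plain copy stands in
for that step below). A GNU statement expression takes the type of its last
expression, so declaring `ret` with `__typeof(__value)` makes the macro
yield the caller's pointer type instead of `unsigned long`:

#include <cstdio>

// Simplified stand-in for ptrauth_strip's inline-asm fallback.
#define strip_sketch(__value)                                  \
  ({                                                           \
    __typeof(__value) ret; /* was: unsigned long ret; */       \
    ret = (__value);       /* real code: asm stripping PAC */  \
    ret;                   /* statement-expression result */   \
  })

int main() {
  int x = 42;
  int *p = &x;
  // With `unsigned long ret` this assignment would be an
  // integer-to-pointer conversion; with __typeof it is well-typed.
  int *q = strip_sketch(p);
  std::printf("%d\n", *q);
  return 0;
}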
---
 compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
index b5215c0d49c06..265a9925a15a0 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h
@@ -18,7 +18,7 @@
 // the NOP space so will do nothing when it is not enabled or not available.
 #  define ptrauth_strip(__value, __key) \
     ({                                  \
-      unsigned long ret;                \
+      __typeof(__value) ret;            \
       asm volatile(                     \
           "mov x30, %1\n\t"             \
           "hint #7\n\t"                 \

>From 5bf085921ec23e5fa1ea4a159c55a618a9299ce6 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Fri, 26 Jul 2024 10:03:17 +0900
Subject: [PATCH 225/259] Revert "[llvm][Bazel] Adapt to
 4eb30cfb3474e3770b465cdb39db3b7f6404c3ef"

Since #99276 has landed, the dependency has become redundant.

This reverts commit aa94a43178e1e1fa4dbe7ee802d46623667067ae.
(llvmorg-19-init-17718-gaa94a43178e1)
---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 64d36c7b7f664..4d443e809d55b 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -944,10 +944,7 @@ cc_library(
     srcs = glob([
         "lib/IR/*.cpp",
         "lib/IR/*.h",
-    ]) + [
-        # To avoid a dependency cycle.
-        "include/llvm/Analysis/IVDescriptors.h",
-    ],
+    ]),
     hdrs = glob(
         [
             "include/llvm/*.h",

>From 978c40b4482c7f87a5d087ade85cee53089146d4 Mon Sep 17 00:00:00 2001
From: Ding Fei <fding at feysh.com>
Date: Fri, 26 Jul 2024 09:34:56 +0800
Subject: [PATCH 226/259] [clang][ASTImporter][NFC] add unittests for unnamed
 EnumDecl (#100545)

These tests cover structural equivalence checking and importing of
multiple anonymous EnumDecls.

We found the anonymous-enum importing issue a few days ago and tried to
fix it, but 0a6233a68c7b575d05bca0f0c708b7e97cc710d1 had already done so.
These tests are still useful to guard against regressions.
---
 clang/unittests/AST/ASTImporterTest.cpp       | 37 +++++++++++++++++++
 .../AST/StructuralEquivalenceTest.cpp         | 14 +++++++
 2 files changed, 51 insertions(+)

diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 6d987cc7e9ec6..9b12caa37cf79 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9783,6 +9783,43 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
   EXPECT_EQ(ImportedE2, ToE1);
 }
 
+TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) {
+  Decl *ToTU = getToTuDecl("", Lang_CXX03);
+  Decl *FromTU = getTuDecl(
+      R"(
+        struct foo {
+          enum { A };
+          enum { B };
+        };
+      )",
+      Lang_CXX03);
+
+  auto EnumConstA = enumConstantDecl(hasName("A"));
+  auto EnumConstB = enumConstantDecl(hasName("B"));
+
+  auto *FromA = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstA);
+  auto *FromB = FirstDeclMatcher<EnumConstantDecl>().match(FromTU, EnumConstB);
+
+  auto *ToA = Import(FromA, Lang_CXX03);
+  auto *ToB = Import(FromB, Lang_CXX03);
+
+  ASSERT_TRUE(ToA);
+  ASSERT_TRUE(ToB);
+
+  auto *ToFooA = FirstDeclMatcher<CXXRecordDecl>().match(
+      ToTU, tagDecl(has(enumDecl(has(EnumConstA)))));
+  auto *ToFooB = FirstDeclMatcher<CXXRecordDecl>().match(
+      ToTU, tagDecl(has(enumDecl(has(EnumConstB)))));
+  ASSERT_EQ(ToFooA, ToFooB);
+
+  // different EnumDecl
+  auto *ToEnumDeclA =
+      FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstA)));
+  auto *ToEnumDeclB =
+      FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl(has(EnumConstB)));
+  ASSERT_NE(ToEnumDeclA, ToEnumDeclB);
+}
+
 INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
                          DefaultTestValuesForRunOptions);
 
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 952c83be0cb64..e994086c99d04 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -1109,6 +1109,20 @@ TEST_F(StructuralEquivalenceEnumTest, EnumsWithDifferentBody) {
   EXPECT_FALSE(testStructuralMatch(t));
 }
 
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithSameConsts) {
+  // field x is required to trigger comparison of the anonymous enum
+  auto t = makeNamedDecls("struct foo { enum { A } x; };",
+                          "struct foo { enum { A } x;};", Lang_CXX11);
+  EXPECT_TRUE(testStructuralMatch(t));
+}
+
+TEST_F(StructuralEquivalenceEnumTest, AnonymousEnumsWithDiffConsts) {
+  // field x is required to trigger comparison of the anonymous enum
+  auto t = makeNamedDecls("struct foo { enum { A } x; };",
+                          "struct foo { enum { B } x;};", Lang_CXX11);
+  EXPECT_FALSE(testStructuralMatch(t));
+}
+
 struct StructuralEquivalenceEnumConstantTest : StructuralEquivalenceTest {};
 
 TEST_F(StructuralEquivalenceEnumConstantTest, EnumConstantsWithSameValues) {

>From 4d51e83728d17d14e25ab0f36006badb21eea3e3 Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg at google.com>
Date: Thu, 25 Jul 2024 22:18:05 -0400
Subject: [PATCH 227/259] [mlir] fixes for f6431f0c5268

---
 mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h | 1 +
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index aca25caba4dfc..6be5548fdb60e 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -9,6 +9,7 @@
 #define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_
 
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Builders.h"
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 536c8256b4274..cb67eb890b7ad 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5744,6 +5744,7 @@ cc_library(
         "lib/Conversion/GPUCommon/OpToFuncCallLowering.h",
     ],
     deps = [
+        ":ArithDialect",
         ":GPUDialect",
         ":IR",
         ":LLVMCommonConversion",

>From 6f65a397858cb9ed598db7433f332e5bc1862637 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 26 Jul 2024 10:32:55 +0800
Subject: [PATCH 228/259] [RISCV] Update RISCVVectorPeephole pass name

The pass was previously called RISCVFoldMasks.
---
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 4 +++-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index f328c55e1d3ba..20c014a6782a9 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -56,7 +56,9 @@ class RISCVVectorPeephole : public MachineFunctionPass {
         MachineFunctionProperties::Property::IsSSA);
   }
 
-  StringRef getPassName() const override { return "RISC-V Fold Masks"; }
+  StringRef getPassName() const override {
+    return "RISC-V Vector Peephole Optimization";
+  }
 
 private:
   bool convertToVLMAX(MachineInstr &MI) const;
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3611d92826235..e305a74d7aef3 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -90,7 +90,7 @@
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       RISC-V DAG->DAG Pattern Instruction Selection
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT:       RISC-V Fold Masks
+; CHECK-NEXT:       RISC-V Vector Peephole Optimization
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Early Tail Duplication
 ; CHECK-NEXT:       Optimize machine instruction PHIs

>From f4be6812e203690073280b9ac8d60092d75bbdce Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg at google.com>
Date: Thu, 25 Jul 2024 22:52:31 -0400
Subject: [PATCH 229/259] [mlir][bazel] fix for c194bc77a21d

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index cb67eb890b7ad..f83c4715408ba 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2142,6 +2142,7 @@ cc_library(
         ":FuncTransforms",
         ":FunctionInterfaces",
         ":IR",
+        ":IndexDialect",
         ":LLVMCommonConversion",
         ":LLVMDialect",
         ":MemRefDialect",
@@ -2149,6 +2150,7 @@ cc_library(
         ":SCFDialect",
         ":SCFTransforms",
         ":TransformUtils",
+        ":VectorUtils",
         "//llvm:Support",
     ],
 )

>From 70a9535f714e2fdb84af243a1a316012c8019049 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <ashaposhnikov at google.com>
Date: Thu, 25 Jul 2024 21:34:08 -0700
Subject: [PATCH 230/259] [Instrumentation][nsan] Add support for Freeze
 instruction (#100490)

Add support for Freeze.

This fixes https://github.com/llvm/llvm-project/issues/98143.
---
 .../NumericalStabilitySanitizer.cpp           |  3 +++
 .../NumericalStabilitySanitizer/basic.ll      | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index b382fedde027b..832506f639a74 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -1725,6 +1725,9 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
                                 Map.getShadow(S->getTrueValue()),
                                 Map.getShadow(S->getFalseValue()));
 
+  if (auto *Freeze = dyn_cast<FreezeInst>(&Inst))
+    return Builder.CreateFreeze(Map.getShadow(Freeze->getOperand(0)));
+
   if (auto *Extract = dyn_cast<ExtractElementInst>(&Inst))
     return Builder.CreateExtractElement(
         Map.getShadow(Extract->getVectorOperand()), Extract->getIndexOperand());
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 8110dd485d369..5da68320d91f9 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -737,6 +737,29 @@ entry:
   ret void
 }
 
+define void @freeze_vector_insert(<2 x float> %vec, i32 %idx, float %scalar) sanitize_numerical_stability {
+; CHECK-LABEL: @freeze_vector_insert(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @freeze_vector_insert to i64)
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr @__nsan_shadow_args_ptr, align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[VEC:%.*]] to <2 x double>
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr getelementptr ([16384 x i8], ptr @__nsan_shadow_args_ptr, i64 0, i64 16), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fpext float [[SCALAR:%.*]] to double
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP1]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT:    store i64 0, ptr @__nsan_shadow_args_tag, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[VEC]], float [[SCALAR]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP7]], i32 [[IDX]]
+; CHECK-NEXT:    [[FROZEN:%.*]] = freeze <2 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = freeze <2 x double> [[TMP9]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %1 = insertelement <2 x float> %vec, float %scalar, i32 %idx
+  %frozen = freeze <2 x float> %1
+  ret void
+}
 
 define void @vector_shuffle(<2 x float> %0) sanitize_numerical_stability {
 ; CHECK-LABEL: @vector_shuffle(

>From e108853ac8fad27ff22be9303c87d90bcdf0ef53 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 09:57:28 +0400
Subject: [PATCH 231/259] clang: Allow targets to set custom metadata on
 atomics (#96906)

Use this to replace emission of the amdgpu-unsafe-fp-atomics attribute
with per-instruction metadata. In the future, new fine-grained controls
should be introduced that also cover the integer cases.

Add a wrapper around CreateAtomicRMW that appends the metadata, and
update a few call sites to use it.
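
As a rough, self-contained sketch of the wrapper-plus-hook shape described
above (the class and function names below are stand-ins, not the real clang
CodeGen APIs), the wrapper creates the instruction and then gives the target
a chance to annotate it:

#include <iostream>
#include <string>
#include <vector>

// Stand-in for an LLVM instruction carrying metadata.
struct Instruction {
  std::vector<std::string> metadata;
};

// Stand-in for TargetCodeGenInfo: the default hook attaches nothing.
struct TargetHooks {
  virtual void setTargetAtomicMetadata(Instruction &rmw) const {}
  virtual ~TargetHooks() = default;
};

// Stand-in for the AMDGPU override, which tags FP atomics; the metadata
// string here is illustrative, not necessarily the exact name used.
struct AMDGPUHooks : TargetHooks {
  void setTargetAtomicMetadata(Instruction &rmw) const override {
    rmw.metadata.push_back("amdgpu.no.fine.grained.memory");
  }
};

// The wrapper: build the atomicrmw, then let the target annotate it,
// mirroring emitAtomicRMWInst calling setTargetAtomicMetadata.
Instruction emitAtomicRMW(const TargetHooks &hooks) {
  Instruction rmw; // stands in for Builder.CreateAtomicRMW(...)
  hooks.setTargetAtomicMetadata(rmw);
  return rmw;
}

int main() {
  AMDGPUHooks hooks;
  Instruction rmw = emitAtomicRMW(hooks);
  std::cout << "attached metadata entries: " << rmw.metadata.size() << "\n";
  return 0;
}

Call sites then go through the wrapper instead of calling the IR builder
directly, so every target sees every atomicrmw it may want to annotate.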
---
 clang/lib/CodeGen/CGAtomic.cpp                |  13 +-
 clang/lib/CodeGen/CGExprScalar.cpp            |  13 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |   4 +-
 clang/lib/CodeGen/CodeGenFunction.h           |   7 +
 clang/lib/CodeGen/TargetInfo.h                |   4 +
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  19 ++
 .../test/CodeGen/AMDGPU/amdgpu-atomic-float.c | 316 ++++++++++++++++++
 clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu   | 102 ++++--
 .../test/OpenMP/amdgpu-unsafe-fp-atomics.cpp  |  59 ++++
 9 files changed, 505 insertions(+), 32 deletions(-)
 create mode 100644 clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
 create mode 100644 clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp

diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index fbf942d06ca6e..fbe9569e50ef6 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -727,7 +727,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
 
   llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
   llvm::AtomicRMWInst *RMWI =
-      CGF.Builder.CreateAtomicRMW(Op, Ptr, LoadVal1, Order, Scope);
+      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
   RMWI->setVolatile(E->isVolatile());
 
   // For __atomic_*_fetch operations, perform the operation again to
@@ -2034,6 +2034,17 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
                                            IsWeak);
 }
 
+llvm::AtomicRMWInst *
+CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
+                                   llvm::Value *Val, llvm::AtomicOrdering Order,
+                                   llvm::SyncScope::ID SSID) {
+
+  llvm::AtomicRMWInst *RMW =
+      Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
+  getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+  return RMW;
+}
+
 void CodeGenFunction::EmitAtomicUpdate(
     LValue LVal, llvm::AtomicOrdering AO,
     const llvm::function_ref<RValue(RValue)> &UpdateOp, bool IsVolatile) {
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index a17d68424bbce..84392745ea614 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2841,9 +2841,10 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
           isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub;
       llvm::Value *amt = llvm::ConstantFP::get(
           VMContext, llvm::APFloat(static_cast<float>(1.0)));
-      llvm::Value *old =
-          Builder.CreateAtomicRMW(aop, LV.getAddress(), amt,
-                                  llvm::AtomicOrdering::SequentiallyConsistent);
+      llvm::AtomicRMWInst *old =
+          CGF.emitAtomicRMWInst(aop, LV.getAddress(), amt,
+                                llvm::AtomicOrdering::SequentiallyConsistent);
+
       return isPre ? Builder.CreateBinOp(op, old, amt) : old;
     }
     value = EmitLoadOfLValue(LV, E->getExprLoc());
@@ -3583,9 +3584,9 @@ LValue ScalarExprEmitter::EmitCompoundAssignLValue(
             EmitScalarConversion(OpInfo.RHS, E->getRHS()->getType(), LHSTy,
                                  E->getExprLoc()),
             LHSTy);
-        Value *OldVal = Builder.CreateAtomicRMW(
-            AtomicOp, LHSLV.getAddress(), Amt,
-            llvm::AtomicOrdering::SequentiallyConsistent);
+
+        llvm::AtomicRMWInst *OldVal =
+            CGF.emitAtomicRMWInst(AtomicOp, LHSLV.getAddress(), Amt);
 
         // Since operation is atomic, the result type is guaranteed to be the
         // same as the input in LLVM terms.
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 853046bf43495..4ee9840f12e06 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6326,8 +6326,8 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X,
       UpdateVal = CGF.Builder.CreateCast(llvm::Instruction::CastOps::UIToFP, IC,
                                          X.getAddress().getElementType());
   }
-  llvm::Value *Res =
-      CGF.Builder.CreateAtomicRMW(RMWOp, X.getAddress(), UpdateVal, AO);
+  llvm::AtomicRMWInst *Res =
+      CGF.emitAtomicRMWInst(RMWOp, X.getAddress(), UpdateVal, AO);
   return std::make_pair(true, RValue::get(Res));
 }
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 67e3019565cd0..bd62c65d8cce6 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4160,6 +4160,13 @@ class CodeGenFunction : public CodeGenTypeCache {
           llvm::AtomicOrdering::SequentiallyConsistent,
       bool IsWeak = false, AggValueSlot Slot = AggValueSlot::ignored());
 
+  /// Emit an atomicrmw instruction, applying relevant metadata when
+  /// applicable.
+  llvm::AtomicRMWInst *emitAtomicRMWInst(
+      llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val,
+      llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent,
+      llvm::SyncScope::ID SSID = llvm::SyncScope::System);
+
   void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
                         const llvm::function_ref<RValue(RValue)> &UpdateOp,
                         bool IsVolatile);
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 2f2138582ba1e..8f17c053f4783 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -334,6 +334,10 @@ class TargetCodeGenInfo {
                                                  llvm::AtomicOrdering Ordering,
                                                  llvm::LLVMContext &Ctx) const;
 
+  /// Allow the target to apply other metadata to an atomic instruction.
+  virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,
+                                       llvm::AtomicRMWInst &RMW) const {}
+
   /// Interface class for filling custom fields of a block literal for OpenCL.
   class TargetOpenCLBlockHelper {
   public:
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4d3275e17c386..37e6af3d4196a 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -311,6 +311,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
                                          SyncScope Scope,
                                          llvm::AtomicOrdering Ordering,
                                          llvm::LLVMContext &Ctx) const override;
+  void setTargetAtomicMetadata(CodeGenFunction &CGF,
+                               llvm::AtomicRMWInst &RMW) const override;
   llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                          llvm::Function *BlockInvokeFunc,
                                          llvm::Type *BlockTy) const override;
@@ -546,6 +548,23 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
   return Ctx.getOrInsertSyncScopeID(Name);
 }
 
+void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
+    CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const {
+  if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
+    return;
+
+  // TODO: Introduce new, more controlled options that also work for integers,
+  // and deprecate allowAMDGPUUnsafeFPAtomics.
+  llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation();
+  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
+    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
+    RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty);
+
+    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy())
+      RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty);
+  }
+}
+
 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
   return false;
 }
diff --git a/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
new file mode 100644
index 0000000000000..6deff1116e1d8
--- /dev/null
+++ b/clang/test/CodeGen/AMDGPU/amdgpu-atomic-float.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,SAFE %s
+// RUN: %clang_cc1 -fnative-half-arguments-and-returns -triple amdgcn-amd-amdhsa-gnu -target-cpu gfx900 -emit-llvm -munsafe-fp-atomics -o - %s | FileCheck -check-prefixes=CHECK,UNSAFE %s
+
+// SAFE-LABEL: define dso_local float @test_float_post_inc(
+// SAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_inc()
+{
+    static _Atomic float n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    ret float [[TMP0]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_dc()
+{
+    static _Atomic float n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_float_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_dc()
+{
+    static _Atomic float n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local float @test_float_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    ret float [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local float @test_float_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_float_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_inc()
+{
+    static _Atomic float n;
+    return ++n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_inc()
+{
+    static _Atomic double n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_dc()
+{
+    static _Atomic double n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test_double_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_dc()
+{
+    static _Atomic double n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local double @test_double_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// SAFE-NEXT:    ret double [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local double @test_double_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca double, align 8, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test_double_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL_ASCAST]], align 8
+// UNSAFE-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_inc()
+{
+    static _Atomic double n;
+    return ++n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_post_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP1]]
+//
+_Float16 test__Float16_post_inc()
+{
+    static _Atomic _Float16 n;
+    return n++;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP1]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_post_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_post_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    store float [[TMP0]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP1:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP1]]
+//
+_Float16 test__Float16_post_dc()
+{
+    static _Atomic _Float16 n;
+    return n--;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_dc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_dc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_dc()
+{
+    static _Atomic _Float16 n;
+    return --n;
+}
+
+// SAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// SAFE-SAME: ) #[[ATTR0]] {
+// SAFE-NEXT:  [[ENTRY:.*:]]
+// SAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// SAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// SAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2
+// SAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// SAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// SAFE-NEXT:    ret half [[TMP2]]
+//
+// UNSAFE-LABEL: define dso_local half @test__Float16_pre_inc(
+// UNSAFE-SAME: ) #[[ATTR0]] {
+// UNSAFE-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-NEXT:    [[RETVAL:%.*]] = alloca half, align 2, addrspace(5)
+// UNSAFE-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// UNSAFE-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @test__Float16_pre_inc.n to ptr), float 1.000000e+00 seq_cst, align 2, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
+// UNSAFE-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// UNSAFE-NEXT:    store float [[TMP1]], ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    [[TMP2:%.*]] = load half, ptr [[RETVAL_ASCAST]], align 2
+// UNSAFE-NEXT:    ret half [[TMP2]]
+//
+_Float16 test__Float16_pre_inc()
+{
+    static _Atomic _Float16 n;
+    return ++n;
+}
+//.
+// UNSAFE: [[META3]] = !{}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index eeb23bc7e1c01..55ddb52da311e 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -1,6 +1,10 @@
 // RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
-// RUN:   -fnative-half-arguments-and-returns | FileCheck %s
+// RUN:   -fnative-half-arguments-and-returns | FileCheck -check-prefixes=CHECK,SAFEIR %s
+
+// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \
+// RUN:   -fcuda-is-device -target-cpu gfx906 -fnative-half-type \
+// RUN:   -fnative-half-arguments-and-returns -munsafe-fp-atomics | FileCheck -check-prefixes=CHECK,UNSAFEIR %s
 
 // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device -target-cpu gfx1100 -fnative-half-type \
@@ -18,24 +22,38 @@
 
 __global__ void ffp1(float *p) {
   // CHECK-LABEL: @_Z4ffp1Pf
-  // CHECK: atomicrmw fadd ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE: _Z4ffp1Pf
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
+  // SAFE: global_atomic_cmpswap
+
   // UNSAFE: _Z4ffp1Pf
   // UNSAFE: global_atomic_add_f32
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
+  // UNSAFE: global_atomic_cmpswap
+
   __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
+  __atomic_fetch_sub(p, 1.0f, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0f, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0f, memory_order_relaxed);
   __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
@@ -44,23 +62,36 @@ __global__ void ffp1(float *p) {
 
 __global__ void ffp2(double *p) {
   // CHECK-LABEL: @_Z4ffp2Pd
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE-LABEL: @_Z4ffp2Pd
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
+  // SAFE: global_atomic_cmpswap_b64
+
   // UNSAFE-LABEL: @_Z4ffp2Pd
+  // UNSAFE: global_atomic_add_f64
   // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
+  __atomic_fetch_add(p, 1.0, memory_order_relaxed);
   __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0, memory_order_relaxed);
@@ -71,11 +102,20 @@ __global__ void ffp2(double *p) {
 // long double is the same as double for amdgcn.
 __global__ void ffp3(long double *p) {
   // CHECK-LABEL: @_Z4ffp3Pe
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE-LABEL: @_Z4ffp3Pe
   // SAFE: global_atomic_cmpswap_b64
   // SAFE: global_atomic_cmpswap_b64
@@ -88,6 +128,7 @@ __global__ void ffp3(long double *p) {
   // UNSAFE: global_atomic_cmpswap_x2
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
+  __atomic_fetch_add(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0L, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0L, memory_order_relaxed);
@@ -98,37 +139,52 @@ __global__ void ffp3(long double *p) {
 __device__ double ffp4(double *p, float f) {
   // CHECK-LABEL: @_Z4ffp4Pdf
   // CHECK: fpext float {{.*}} to double
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   return __atomic_fetch_sub(p, f, memory_order_relaxed);
 }
 
 __device__ double ffp5(double *p, int i) {
   // CHECK-LABEL: @_Z4ffp5Pdi
   // CHECK: sitofp i32 {{.*}} to double
-  // CHECK: atomicrmw fsub ptr {{.*}} monotonic
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   return __atomic_fetch_sub(p, i, memory_order_relaxed);
 }
 
 __global__ void ffp6(_Float16 *p) {
   // CHECK-LABEL: @_Z4ffp6PDF16
-  // CHECK: atomicrmw fadd ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} monotonic
-  // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic
-  // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic
+  // SAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}}
+
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+
   // SAFE: _Z4ffp6PDF16
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
   // SAFE: global_atomic_cmpswap
+  // SAFE: global_atomic_cmpswap
+
   // UNSAFE: _Z4ffp6PDF16
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
   // UNSAFE: global_atomic_cmpswap
+  // UNSAFE: global_atomic_cmpswap
   __atomic_fetch_add(p, 1.0, memory_order_relaxed);
+  __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
   __atomic_fetch_max(p, 1.0, memory_order_relaxed);
   __atomic_fetch_min(p, 1.0, memory_order_relaxed);
   __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT);
diff --git a/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
new file mode 100644
index 0000000000000..7a34113cec8fa
--- /dev/null
+++ b/clang/test/OpenMP/amdgpu-unsafe-fp-atomics.cpp
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=DEFAULT %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -munsafe-fp-atomics -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
+
+#pragma omp declare target
+
+float fv, fx;
+double dv, dx;
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// DEFAULT-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEFAULT-NEXT:  [[ENTRY:.*:]]
+// DEFAULT-NEXT:    [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4
+// DEFAULT-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT:    store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// DEFAULT-NEXT:    ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f32v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0:[0-9]+]] {
+// UNSAFE-FP-ATOMICS-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP0:%.*]] = load float, ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @fx to ptr), float [[TMP0]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]], !amdgpu.ignore.denormal.mode [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT:    store float [[ADD]], ptr addrspacecast (ptr addrspace(1) @fv to ptr), align 4
+// UNSAFE-FP-ATOMICS-NEXT:    ret void
+//
+void atomic_fadd_f32() {
+#pragma omp atomic capture
+  fv = fx = fx + fv;
+}
+
+// DEFAULT-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// DEFAULT-SAME: ) #[[ATTR0]] {
+// DEFAULT-NEXT:  [[ENTRY:.*:]]
+// DEFAULT-NEXT:    [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8
+// DEFAULT-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// DEFAULT-NEXT:    store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// DEFAULT-NEXT:    ret void
+//
+// UNSAFE-FP-ATOMICS-LABEL: define hidden void @_Z15atomic_fadd_f64v(
+// UNSAFE-FP-ATOMICS-SAME: ) #[[ATTR0]] {
+// UNSAFE-FP-ATOMICS-NEXT:  [[ENTRY:.*:]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP0:%.*]] = load double, ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @dx to ptr), double [[TMP0]] monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// UNSAFE-FP-ATOMICS-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP0]]
+// UNSAFE-FP-ATOMICS-NEXT:    store double [[ADD]], ptr addrspacecast (ptr addrspace(1) @dv to ptr), align 8
+// UNSAFE-FP-ATOMICS-NEXT:    ret void
+//
+void atomic_fadd_f64() {
+#pragma omp atomic capture
+  dv = dx = dx + dv;
+}
+
+#pragma omp end declare target
+//.
+// UNSAFE-FP-ATOMICS: [[META5]] = !{}
+//.

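For out-of-tree targets, the new TargetCodeGenInfo::setTargetAtomicMetadata() hook can be
overridden in the same way as the AMDGPU change above; the atomicrmw sites changed in this
patch are now routed through CodeGenFunction::emitAtomicRMWInst(), which invokes the hook.
A minimal sketch only: the class name "MyTargetCodeGenInfo" and the metadata kind
"mytarget.no.remote.memory" are hypothetical, and the usual DefaultABIInfo boilerplate from
clang/lib/CodeGen/Targets/*.cpp is assumed.

  // Sketch, not part of the patch; would live alongside the other Targets/*.cpp files.
  namespace {
  class MyTargetCodeGenInfo : public TargetCodeGenInfo {
  public:
    MyTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
        : TargetCodeGenInfo(std::make_unique<DefaultABIInfo>(CGT)) {}

    void setTargetAtomicMetadata(CodeGenFunction &CGF,
                                 llvm::AtomicRMWInst &RMW) const override {
      // Tag floating-point atomicrmw instructions, mirroring the AMDGPU hook above.
      if (llvm::AtomicRMWInst::isFPOperation(RMW.getOperation()))
        RMW.setMetadata("mytarget.no.remote.memory",
                        llvm::MDNode::get(CGF.getLLVMContext(), {}));
    }
  };
  } // namespace

Clang then attaches that metadata to the emitted atomicrmw, in the same way the tests above
demonstrate for !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode.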
>From 63d088c6e46122bc776f89ded4f285feaab69ae6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 10:01:34 +0400
Subject: [PATCH 232/259] FastISel: Do not check for module debug info
 (#100353)

I don't see the point of this check, and SelectionDAG
does not perform it. In the normal use case, if there's
no debug info, the debug intrinsics would not be there
in the first place.
---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index ef9f783355190..e255bbaa92b16 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1203,11 +1203,6 @@ void FastISel::handleDbgInfo(const Instruction *II) {
 
     if (DbgLabelRecord *DLR = dyn_cast<DbgLabelRecord>(&DR)) {
       assert(DLR->getLabel() && "Missing label");
-      if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-        LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DLR << "\n");
-        continue;
-      }
-
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DLR->getDebugLoc(),
               TII.get(TargetOpcode::DBG_LABEL))
           .addMetadata(DLR->getLabel());
@@ -1402,12 +1397,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     assert(DI->getVariable() && "Missing variable");
-    if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
-                        << " (!hasDebugInfo)\n");
-      return true;
-    }
-
     if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
       return true;
 
@@ -1446,11 +1435,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
   case Intrinsic::dbg_label: {
     const DbgLabelInst *DI = cast<DbgLabelInst>(II);
     assert(DI->getLabel() && "Missing label");
-    if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
-      return true;
-    }
-
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
             TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
     return true;

>From e27358c8ed7abac200546e808ea30a86aa9aa580 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei at loongson.cn>
Date: Fri, 26 Jul 2024 14:36:54 +0800
Subject: [PATCH 233/259] [LoongArch][MC] Support %[ld_/gd_/desc_]pcrel_20

Reviewed By: SixWeining, MaskRay

Pull Request: https://github.com/llvm/llvm-project/pull/100104
---
 .../AsmParser/LoongArchAsmParser.cpp          | 24 +++++++++++++++++++
 .../Target/LoongArch/LoongArchInstrInfo.td    |  6 ++++-
 .../MCTargetDesc/LoongArchFixupKinds.h        |  8 +++++++
 .../MCTargetDesc/LoongArchMCCodeEmitter.cpp   | 12 ++++++++++
 .../MCTargetDesc/LoongArchMCExpr.cpp          | 15 ++++++++++++
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.h  |  4 ++++
 .../test/MC/LoongArch/Basic/Integer/invalid.s |  6 +++--
 .../MC/LoongArch/Relocations/relocations.s    | 20 ++++++++++++++++
 8 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 208bd3db8f14e..f52e188f87792 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -450,6 +450,24 @@ class LoongArchOperand : public MCParsedAsmOperand {
                      IsValidKind;
   }
 
+  bool isSImm20pcaddi() const {
+    if (!isImm())
+      return false;
+
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+                       VK == LoongArchMCExpr::VK_LoongArch_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2;
+    return IsConstantImm
+               ? isInt<20>(Imm) && IsValidKind
+               : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+                     IsValidKind;
+  }
+
   bool isSImm21lsl2() const {
     if (!isImm())
       return false;
@@ -1676,6 +1694,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         /*Upper=*/(1 << 19) - 1,
         "operand must be a symbol with modifier (e.g. %call36) or an integer "
         "in the range");
+  case Match_InvalidSImm20pcaddi:
+    return generateImmOutOfRangeError(
+        Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+        /*Upper=*/(1 << 19) - 1,
+        "operand must be a symbol with modifier (e.g. %pcrel_20) or an integer "
+        "in the range");
   case Match_InvalidSImm21lsl2:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ec0d071453c3f..ef647a4277873 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -397,6 +397,10 @@ def simm20_pcaddu18i : SImm20Operand {
   let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
 }
 
+def simm20_pcaddi : SImm20Operand {
+  let ParserMatchClass = SImmAsmOperand<20, "pcaddi">;
+}
+
 def simm21_lsl2 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
   let EncoderMethod = "getImmOpValueAsr<2>";
@@ -754,7 +758,7 @@ def SLT  : ALU_3R<0x00120000>;
 def SLTU : ALU_3R<0x00128000>;
 def SLTI  : ALU_2RI12<0x02000000, simm12>;
 def SLTUI : ALU_2RI12<0x02400000, simm12>;
-def PCADDI    : ALU_1RI20<0x18000000, simm20>;
+def PCADDI    : ALU_1RI20<0x18000000, simm20_pcaddi>;
 def PCADDU12I : ALU_1RI20<0x1c000000, simm20>;
 def PCALAU12I : ALU_1RI20<0x1a000000, simm20_pcalau12i>;
 def AND  : ALU_3R<0x00148000>;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 29ed14f6e4d62..370f5b0189b51 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,8 @@ enum Fixups {
   fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
   // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
   fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+  // 20-bit fixup corresponding to %pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_pcrel20_s2,
   // 36-bit fixup corresponding to %call36(foo) for a pair instructions:
   // pcaddu18i+jirl.
   fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
@@ -142,6 +144,12 @@ enum Fixups {
   fixup_loongarch_tls_le_add_r,
   // 12-bit fixup corresponding to %le_lo12_r(foo) for instruction addi.w/d.
   fixup_loongarch_tls_le_lo12_r,
+  // 20-bit fixup corresponding to %ld_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_ld_pcrel20_s2,
+  // 20-bit fixup corresponding to %gd_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_gd_pcrel20_s2,
+  // 20-bit fixup corresponding to %desc_pcrel_20(foo) for instruction pcaddi.
+  fixup_loongarch_tls_desc_pcrel20_s2,
 };
 } // end namespace LoongArch
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index efbfce35dbfca..4f7f93fa4783e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -287,6 +287,18 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R:
       FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r;
       break;
+    case LoongArchMCExpr::VK_LoongArch_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_LD_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_ld_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_GD_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_gd_pcrel20_s2;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PCREL20_S2:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_pcrel20_s2;
+      break;
     }
   } else if (Kind == MCExpr::SymbolRef &&
              cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 98b9b1ab6d3f4..53d46cca913e5 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -166,6 +166,14 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
     return "le_add_r";
   case VK_LoongArch_TLS_LE_LO12_R:
     return "le_lo12_r";
+  case VK_LoongArch_PCREL20_S2:
+    return "pcrel_20";
+  case VK_LoongArch_TLS_LD_PCREL20_S2:
+    return "ld_pcrel_20";
+  case VK_LoongArch_TLS_GD_PCREL20_S2:
+    return "gd_pcrel_20";
+  case VK_LoongArch_TLS_DESC_PCREL20_S2:
+    return "desc_pcrel_20";
   }
 }
 
@@ -222,6 +230,10 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
       .Case("le_hi20_r", VK_LoongArch_TLS_LE_HI20_R)
       .Case("le_add_r", VK_LoongArch_TLS_LE_ADD_R)
       .Case("le_lo12_r", VK_LoongArch_TLS_LE_LO12_R)
+      .Case("pcrel_20", VK_LoongArch_PCREL20_S2)
+      .Case("ld_pcrel_20", VK_LoongArch_TLS_LD_PCREL20_S2)
+      .Case("gd_pcrel_20", VK_LoongArch_TLS_GD_PCREL20_S2)
+      .Case("desc_pcrel_20", VK_LoongArch_TLS_DESC_PCREL20_S2)
       .Default(VK_LoongArch_Invalid);
 }
 
@@ -264,6 +276,9 @@ void LoongArchMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case VK_LoongArch_TLS_GD_HI20:
   case VK_LoongArch_TLS_DESC_PC_HI20:
   case VK_LoongArch_TLS_DESC_HI20:
+  case VK_LoongArch_TLS_LD_PCREL20_S2:
+  case VK_LoongArch_TLS_GD_PCREL20_S2:
+  case VK_LoongArch_TLS_DESC_PCREL20_S2:
     break;
   }
   fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 1546d894d56ab..91215b863ccc2 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -75,6 +75,10 @@ class LoongArchMCExpr : public MCTargetExpr {
     VK_LoongArch_TLS_LE_HI20_R,
     VK_LoongArch_TLS_LE_ADD_R,
     VK_LoongArch_TLS_LE_LO12_R,
+    VK_LoongArch_PCREL20_S2,
+    VK_LoongArch_TLS_LD_PCREL20_S2,
+    VK_LoongArch_TLS_GD_PCREL20_S2,
+    VK_LoongArch_TLS_DESC_PCREL20_S2,
     VK_LoongArch_Invalid // Must be the last item.
   };
 
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
index 958d5cab6f2f3..08a131d4d43f9 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid.s
@@ -99,11 +99,13 @@ jirl $a0, $a0, 0x20000
 # CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %b16) or an integer in the range [-131072, 131068]
 
 ## simm20
-pcaddi $a0, -0x80001
-# CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-524288, 524287]
 pcaddu12i $a0, 0x80000
 # CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
 
+## simm20_pcaddi
+pcaddi $a0, -0x80001
+# CHECK: :[[#@LINE-1]]:13: error: operand must be a symbol with modifier (e.g. %pcrel_20) or an integer in the range [-524288, 524287]
+
 ## simm20_lu12iw
 lu12i.w $a0, -0x80001
 # CHECK: :[[#@LINE-1]]:14: error: operand must be a symbol with modifier (e.g. %abs_hi20) or an integer in the range [-524288, 524287]
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index e83b67199e656..091dce200b7de 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -288,3 +288,23 @@ addi.d $t1, $a2, %le_lo12_r(foo)
 # RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0
 # INSTR: addi.d $t1, $a2, %le_lo12_r(foo)
 # FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE
+
+pcaddi $t1, %pcrel_20(foo)
+# RELOC: R_LARCH_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %ld_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_LD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %ld_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %ld_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %gd_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_GD_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %gd_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %gd_pcrel_20(foo), kind: FK_NONE
+
+pcaddi $t1, %desc_pcrel_20(foo)
+# RELOC: R_LARCH_TLS_DESC_PCREL20_S2 foo 0x0
+# INSTR: pcaddi $t1, %desc_pcrel_20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pcrel_20(foo), kind: FK_NONE

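Note: pcaddi computes rd = PC + (simm20 << 2), so each of these modifiers resolves to
(sym - PC) >> 2 at link time (hence the *_pcrel20_s2 fixup names). A hypothetical usage
sketch, with "sym" and "tls_var" standing in for real symbols:

  # Take the PC-relative address of sym in a single instruction.
  # The assembler emits R_LARCH_PCREL20_S2; the linker stores (sym - .) >> 2 in the si20 field.
  pcaddi $a0, %pcrel_20(sym)

  # General-dynamic TLS flavour (cf. the lld tests in the next patch): address the GOT
  # slot pair for tls_var, then call __tls_get_addr.
  pcaddi $a0, %gd_pcrel_20(tls_var)
  bl     %plt(__tls_get_addr)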
>From 0057a969a2a397c1ba57e06b65a8bb56af2ce987 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei at loongson.cn>
Date: Fri, 26 Jul 2024 14:38:36 +0800
Subject: [PATCH 234/259] [lld][ELF][LoongArch] Support
 R_LARCH_TLS_{LD,GD,DESC}_PCREL20_S2

Reviewed By: MaskRay, SixWeining

Pull Request: https://github.com/llvm/llvm-project/pull/100105
---
 lld/ELF/Arch/LoongArch.cpp                  |  10 ++
 lld/ELF/Relocations.cpp                     |   3 +-
 lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s  | 129 ++++++++++++++++++
 lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s  |  82 +++++++++++
 lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s | 142 ++++++++++++++++++++
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
 create mode 100644 lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
 create mode 100644 lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 9466e8b1ce54d..db0bc6c760096 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -511,6 +511,12 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
     return R_TLSDESC;
   case R_LARCH_TLS_DESC_CALL:
     return R_TLSDESC_CALL;
+  case R_LARCH_TLS_LD_PCREL20_S2:
+    return R_TLSLD_PC;
+  case R_LARCH_TLS_GD_PCREL20_S2:
+    return R_TLSGD_PC;
+  case R_LARCH_TLS_DESC_PCREL20_S2:
+    return R_TLSDESC_PC;
 
   // Other known relocs that are explicitly unimplemented:
   //
@@ -557,7 +563,11 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     write64le(loc, val);
     return;
 
+  // Relocs intended for `pcaddi`.
   case R_LARCH_PCREL20_S2:
+  case R_LARCH_TLS_LD_PCREL20_S2:
+  case R_LARCH_TLS_GD_PCREL20_S2:
+  case R_LARCH_TLS_DESC_PCREL20_S2:
     checkInt(loc, val, 22, rel);
     checkAlignment(loc, val, 4, rel);
     write32le(loc, setJ20(read32le(loc), val >> 2));
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 36857d72c647e..6ad5c3bf8f6e9 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1308,7 +1308,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
   // LoongArch does not yet implement transition from TLSDESC to LE/IE, so
   // generate TLSDESC dynamic relocation for the dynamic linker to handle.
   if (config->emachine == EM_LOONGARCH &&
-      oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_CALL>(expr)) {
+      oneof<R_LOONGARCH_TLSDESC_PAGE_PC, R_TLSDESC, R_TLSDESC_PC,
+            R_TLSDESC_CALL>(expr)) {
     if (expr != R_TLSDESC_CALL) {
       sym.setFlags(NEEDS_TLSDESC);
       c.addReloc({expr, type, offset, addend, &sym});
diff --git a/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
new file mode 100644
index 0000000000000..d4d12b9d4a520
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-gd-pcrel20-s2.s
@@ -0,0 +1,129 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/bc.s -o %t/bc.32.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.32.o -o %t/bc.32.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/bc.s -o %t/bc.64.o
+# RUN: ld.lld -shared -soname=bc.so %t/bc.64.o -o %t/bc.64.so
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 GD
+# RUN: ld.lld -shared %t/a.32.o %t/bc.32.o -o %t/gd.32.so
+# RUN: llvm-readobj -r %t/gd.32.so | FileCheck --check-prefix=GD32-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.32.so | FileCheck --check-prefix=GD32 %s
+
+## LA32 GD -> LE
+# RUN: ld.lld %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: ld.lld -pie %t/a.32.o %t/bc.32.o %t/tga.32.o -o %t/le-pie.32
+# RUN: llvm-readelf -r %t/le-pie.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.32 | FileCheck --check-prefix=LE32-GOT %s
+
+## LA32 GD -> IE
+# RUN: ld.lld %t/a.32.o %t/bc.32.so %t/tga.32.o -o %t/ie.32
+# RUN: llvm-readobj -r %t/ie.32 | FileCheck --check-prefix=IE32-REL %s
+# RUN: llvm-readelf -x .got %t/ie.32 | FileCheck --check-prefix=IE32-GOT %s
+
+## LA64 GD
+# RUN: ld.lld -shared %t/a.64.o %t/bc.64.o -o %t/gd.64.so
+# RUN: llvm-readobj -r %t/gd.64.so | FileCheck --check-prefix=GD64-REL %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/gd.64.so | FileCheck --check-prefix=GD64 %s
+
+## LA64 GD -> LE
+# RUN: ld.lld %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: ld.lld -pie %t/a.64.o %t/bc.64.o %t/tga.64.o -o %t/le-pie.64
+# RUN: llvm-readelf -r %t/le-pie.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le-pie.64 | FileCheck --check-prefix=LE64-GOT %s
+
+## LA64 GD -> IE
+# RUN: ld.lld %t/a.64.o %t/bc.64.so %t/tga.64.o -o %t/ie.64
+# RUN: llvm-readobj -r %t/ie.64 | FileCheck --check-prefix=IE64-REL %s
+# RUN: llvm-readelf -x .got %t/ie.64 | FileCheck --check-prefix=IE64-GOT %s
+
+# GD32-REL:      .rela.dyn {
+# GD32-REL-NEXT:   0x20300 R_LARCH_TLS_DTPMOD32 a 0x0
+# GD32-REL-NEXT:   0x20304 R_LARCH_TLS_DTPREL32 a 0x0
+# GD32-REL-NEXT:   0x20308 R_LARCH_TLS_DTPMOD32 b 0x0
+# GD32-REL-NEXT:   0x2030C R_LARCH_TLS_DTPREL32 b 0x0
+# GD32-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x20300 - 0x10250 = 16428<<2
+# GD32:      10250: pcaddi $a0, 16428
+# GD32-NEXT:        bl 44
+
+## &DTPMOD(b) - . = 0x20308 - 0x10258 = 16428<<2
+# GD32:      10258: pcaddi $a0, 16428
+# GD32-NEXT:        bl 36
+
+# GD64-REL:      .rela.dyn {
+# GD64-REL-NEXT:   0x204C0 R_LARCH_TLS_DTPMOD64 a 0x0
+# GD64-REL-NEXT:   0x204C8 R_LARCH_TLS_DTPREL64 a 0x0
+# GD64-REL-NEXT:   0x204D0 R_LARCH_TLS_DTPMOD64 b 0x0
+# GD64-REL-NEXT:   0x204D8 R_LARCH_TLS_DTPREL64 b 0x0
+# GD64-REL-NEXT: }
+
+## &DTPMOD(a) - . = 0x204c0 - 0x10398 = 16458<<2
+# GD64:      10398: pcaddi $a0, 16458
+# GD64-NEXT:        bl 52
+
+## &DTPMOD(b) - . = 0x204d0 - 0x103a0 = 16460<<2
+# GD64:      103a0: pcaddi $a0, 16460
+# GD64-NEXT:        bl 44
+
+# NOREL: no relocations
+
+## .got contains pre-populated values: [a@dtpmod, a@dtprel, b@dtpmod, b@dtprel]
+## a@dtprel = st_value(a) = 0x8
+## b@dtprel = st_value(b) = 0xc
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x[[#%x,A:]] 01000000 08000000 01000000 0c000000
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 08000000 00000000
+# LE64-GOT-NEXT: 0x[[#%x,A:]] 01000000 00000000 0c000000 00000000
+
+## a is local - relaxed to LE - its DTPMOD/DTPREL slots are link-time constants.
+## b is external - DTPMOD/DTPREL dynamic relocations are required.
+# IE32-REL:      .rela.dyn {
+# IE32-REL-NEXT:   0x30220 R_LARCH_TLS_DTPMOD32 b 0x0
+# IE32-REL-NEXT:   0x30224 R_LARCH_TLS_DTPREL32 b 0x0
+# IE32-REL-NEXT: }
+# IE32-GOT:      section '.got':
+# IE32-GOT-NEXT: 0x00030218 01000000 08000000 00000000 00000000
+
+# IE64-REL:      .rela.dyn {
+# IE64-REL-NEXT:   0x30380 R_LARCH_TLS_DTPMOD64 b 0x0
+# IE64-REL-NEXT:   0x30388 R_LARCH_TLS_DTPREL64 b 0x0
+# IE64-REL-NEXT: }
+# IE64-GOT:      section '.got':
+# IE64-GOT-NEXT: 0x00030370 01000000 00000000 08000000 00000000
+# IE64-GOT-NEXT: 0x00030380 00000000 00000000 00000000 00000000
+
+#--- a.s
+pcaddi $a0, %gd_pcrel_20(a)
+bl %plt(__tls_get_addr)
+
+pcaddi $a0, %gd_pcrel_20(b)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 4
+
+#--- bc.s
+.section .tbss,"awT",@nobits
+.globl b, c
+b:
+.zero 4
+c:
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
new file mode 100644
index 0000000000000..70186f5538dfc
--- /dev/null
+++ b/lld/test/ELF/loongarch-tls-ld-pcrel20-s2.s
@@ -0,0 +1,82 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --position-independent %t/a.s -o %t/a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %t/tga.s -o %t/tga.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --position-independent %t/a.s -o %t/a.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %t/tga.s -o %t/tga.64.o
+
+## LA32 LD
+# RUN: ld.lld -shared %t/a.32.o -o %t/ld.32.so
+# RUN: llvm-readobj -r %t/ld.32.so | FileCheck --check-prefix=LD32-REL %s
+# RUN: llvm-readelf -x .got %t/ld.32.so | FileCheck --check-prefix=LD32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.32.so | FileCheck --check-prefixes=LD32 %s
+
+## LA32 LD -> LE
+# RUN: ld.lld %t/a.32.o %t/tga.32.o -o %t/le.32
+# RUN: llvm-readelf -r %t/le.32 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.32 | FileCheck --check-prefix=LE32-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.32 | FileCheck --check-prefixes=LE32 %s
+
+## LA64 LD
+# RUN: ld.lld -shared %t/a.64.o -o %t/ld.64.so
+# RUN: llvm-readobj -r %t/ld.64.so | FileCheck --check-prefix=LD64-REL %s
+# RUN: llvm-readelf -x .got %t/ld.64.so | FileCheck --check-prefix=LD64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/ld.64.so | FileCheck --check-prefixes=LD64 %s
+
+## LA64 LD -> LE
+# RUN: ld.lld %t/a.64.o %t/tga.64.o -o %t/le.64
+# RUN: llvm-readelf -r %t/le.64 | FileCheck --check-prefix=NOREL %s
+# RUN: llvm-readelf -x .got %t/le.64 | FileCheck --check-prefix=LE64-GOT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/le.64 | FileCheck --check-prefixes=LE64 %s
+
+## a@dtprel = st_value(a) = 0 is a link-time constant.
+# LD32-REL:      .rela.dyn {
+# LD32-REL-NEXT:   0x20280 R_LARCH_TLS_DTPMOD32 - 0x0
+# LD32-REL-NEXT: }
+# LD32-GOT:      section '.got':
+# LD32-GOT-NEXT: 0x00020280 00000000 00000000
+
+# LD64-REL:      .rela.dyn {
+# LD64-REL-NEXT:   0x20400 R_LARCH_TLS_DTPMOD64 - 0x0
+# LD64-REL-NEXT: }
+# LD64-GOT:      section '.got':
+# LD64-GOT-NEXT: 0x00020400 00000000 00000000 00000000 00000000
+
+## LA32: &DTPMOD(a) - . = 0x20280 - 0x101cc = 16429<<2
+# LD32:      101cc: pcaddi $a0, 16429
+# LD32-NEXT:        bl 48
+
+## LA64: &DTPMOD(a) - . = 0x20400 - 0x102e0 = 16456<<2
+# LD64:      102e0: pcaddi $a0, 16456
+# LD64-NEXT:        bl 44
+
+# NOREL: no relocations
+
+## a is local - its DTPMOD/DTPREL slots are link-time constants.
+## a@dtpmod = 1 (main module)
+# LE32-GOT: section '.got':
+# LE32-GOT-NEXT: 0x0003011c 01000000 00000000
+
+# LE64-GOT: section '.got':
+# LE64-GOT-NEXT: 0x000301d0 01000000 00000000 00000000 00000000
+
+## LA32: DTPMOD(.LANCHOR0) - . = 0x3011c - 0x20114 = 16386<<2
+# LE32:      20114: pcaddi $a0, 16386
+# LE32-NEXT:        bl 4
+
+## LA64: DTPMOD(.LANCHOR0) - . = 0x301d0 - 0x201c8 = 16386<<2
+# LE64:      201c8: pcaddi $a0, 16386
+# LE64-NEXT:        bl 4
+
+#--- a.s
+pcaddi $a0, %ld_pcrel_20(.LANCHOR0)
+bl %plt(__tls_get_addr)
+
+.section .tbss,"awT",@nobits
+.set .LANCHOR0, . + 0
+.zero 8
+
+#--- tga.s
+.globl __tls_get_addr
+__tls_get_addr:
diff --git a/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
new file mode 100644
index 0000000000000..99e21d9935197
--- /dev/null
+++ b/lld/test/ELF/loongarch-tlsdesc-pcrel20-s2.s
@@ -0,0 +1,142 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 a.s -o a.64.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch64 c.s -o c.64.o
+# RUN: ld.lld -shared -soname=c.64.so c.64.o -o c.64.so
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 a.s -o a.32.o
+# RUN: llvm-mc -filetype=obj -triple=loongarch32 --defsym ELF32=1 c.s -o c.32.o
+# RUN: ld.lld -shared -soname=c.32.so c.32.o -o c.32.so
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o a.64.so
+# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
+# RUN: llvm-objdump --no-show-raw-insn -h -d a.64.so | FileCheck %s --check-prefix=GD64
+
+# RUN: ld.lld -shared -z now a.64.o c.64.o -o rel.64.so -z rel
+# RUN: llvm-readobj -r -x .got rel.64.so | FileCheck --check-prefix=GD64-REL %s
+
+## FIXME: The transition from TLSDESC to IE/LE has not yet been implemented.
+## Keep the dynamic relocations and hand them over to the dynamic linker.
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.o -o a.64.le
+# RUN: llvm-readobj -r -x .got a.64.le | FileCheck --check-prefix=LE64-RELA %s
+
+# RUN: ld.lld -e 0 -z now a.64.o c.64.so -o a.64.ie
+# RUN: llvm-readobj -r -x .got a.64.ie | FileCheck --check-prefix=IE64-RELA %s
+
+## 32-bit code is mostly the same. We only test a few variants.
+
+# RUN: ld.lld -shared -z now a.32.o c.32.o -o rel.32.so -z rel
+# RUN: llvm-readobj -r -x .got rel.32.so | FileCheck --check-prefix=GD32-REL %s
+
+# GD64-RELA:      .rela.dyn {
+# GD64-RELA-NEXT:   0x203F0 R_LARCH_TLS_DESC64 - 0x7FF
+# GD64-RELA-NEXT:   0x203D0 R_LARCH_TLS_DESC64 a 0x0
+# GD64-RELA-NEXT:   0x203E0 R_LARCH_TLS_DESC64 c 0x0
+# GD64-RELA-NEXT: }
+# GD64-RELA:      Hex dump of section '.got':
+# GD64-RELA-NEXT: 0x000203d0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203e0 00000000 00000000 00000000 00000000 .
+# GD64-RELA-NEXT: 0x000203f0 00000000 00000000 00000000 00000000 .
+
+# GD64-REL:      .rel.dyn {
+# GD64-REL-NEXT:   0x203D8 R_LARCH_TLS_DESC64 -
+# GD64-REL-NEXT:   0x203B8 R_LARCH_TLS_DESC64 a
+# GD64-REL-NEXT:   0x203C8 R_LARCH_TLS_DESC64 c
+# GD64-REL-NEXT: }
+# GD64-REL:      Hex dump of section '.got':
+# GD64-REL-NEXT: 0x000203b8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203c8 00000000 00000000 00000000 00000000 .
+# GD64-REL-NEXT: 0x000203d8 00000000 00000000 ff070000 00000000 .
+
+# GD64:      .got     00000030 00000000000203d0
+
+## &.got[a]-. = 0x203d0 - 0x102e0 = 16444<<2
+# GD64:        102e0: pcaddi $a0, 16444
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a1, $a0, $tp
+
+## &.got[b]-. = 0x203d0+32 - 0x102f0 = 16448<<2
+# GD64:        102f0: pcaddi $a0, 16448
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a2, $a0, $tp
+
+## &.got[c]-. = 0x203d0+16 - 0x10300 = 16440<<2
+# GD64:        10300: pcaddi $a0, 16440
+# GD64-NEXT:          ld.d $ra, $a0, 0
+# GD64-NEXT:          jirl $ra, $ra, 0
+# GD64-NEXT:          add.d $a3, $a0, $tp
+
+# LE64-RELA:      .rela.dyn {
+# LE64-RELA-NEXT:   0x30240 R_LARCH_TLS_DESC64 - 0x8
+# LE64-RELA-NEXT:   0x30250 R_LARCH_TLS_DESC64 - 0x800
+# LE64-RELA-NEXT:   0x30260 R_LARCH_TLS_DESC64 - 0x7FF
+# LE64-RELA-NEXT: }
+# LE64-RELA:      Hex dump of section '.got':
+# LE64-RELA-NEXT: 0x00030240 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030250 00000000 00000000 00000000 00000000 .
+# LE64-RELA-NEXT: 0x00030260 00000000 00000000 00000000 00000000 .
+
+# IE64-RELA:      .rela.dyn {
+# IE64-RELA-NEXT:   0x303C8 R_LARCH_TLS_DESC64 - 0x8
+# IE64-RELA-NEXT:   0x303E8 R_LARCH_TLS_DESC64 - 0x7FF
+# IE64-RELA-NEXT:   0x303D8 R_LARCH_TLS_DESC64 c 0x0
+# IE64-RELA-NEXT: }
+# IE64-RELA:      Hex dump of section '.got':
+# IE64-RELA-NEXT: 0x000303c8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303d8 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303e8 00000000 00000000 00000000 00000000 .
+
+# GD32-REL:      .rel.dyn {
+# GD32-REL-NEXT:    0x20264 R_LARCH_TLS_DESC32 -
+# GD32-REL-NEXT:    0x20254 R_LARCH_TLS_DESC32 a
+# GD32-REL-NEXT:    0x2025C R_LARCH_TLS_DESC32 c
+# GD32-REL-NEXT: }
+# GD32-REL:      Hex dump of section '.got':
+# GD32-REL-NEXT: 0x00020254 00000000 00000000 00000000 00000000 .
+# GD32-REL-NEXT: 0x00020264 00000000 ff070000                   .
+
+#--- a.s
+.macro add dst, src1, src2
+.ifdef ELF32
+add.w \dst, \src1, \src2
+.else
+add.d \dst, \src1, \src2
+.endif
+.endm
+.macro load dst, src1, src2
+.ifdef ELF32
+ld.w \dst, \src1, \src2
+.else
+ld.d \dst, \src1, \src2
+.endif
+.endm
+
+pcaddi $a0, %desc_pcrel_20(a)
+load $ra, $a0, %desc_ld(a)
+jirl $ra, $ra, %desc_call(a)
+add $a1, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(b)
+load $ra, $a0, %desc_ld(b)
+jirl $ra, $ra, %desc_call(b)
+add $a2, $a0, $tp
+
+pcaddi $a0, %desc_pcrel_20(c)
+load $ra, $a0, %desc_ld(c)
+jirl $ra, $ra, %desc_call(c)
+add $a3, $a0, $tp
+
+.section .tbss,"awT",@nobits
+.globl a
+.zero 8
+a:
+.zero 2039  ## Place b at 0x7ff
+b:
+.zero 1
+
+#--- c.s
+.section .tbss,"awT",@nobits
+.globl c
+c: .zero 4

From 92a06546ab50c2b6beac766d989b30960c353e61 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 26 Jul 2024 08:43:30 +0200
Subject: [PATCH 235/259] [LowerMemIntrinsics] Lower llvm.memmove to wide
 memory accesses (#100122)

So far, the IR-level lowering of llvm.memmove intrinsics generates loops
that copy each byte individually. This can be wasteful for targets that
provide wider memory access operations.

This patch makes the memmove lowering more similar to the lowering of
memcpy with unknown length.
TargetTransformInfo::getMemcpyLoopLoweringType() is queried for an
adequate type for the memory accesses, and if it is wider than a single
byte, the greatest multiple of the type's size that is less than or
equal to the length is copied with corresponding wide memory accesses. A
residual loop with byte-wise accesses (or a sequence of suitable memory
accesses in case the length is statically known) is introduced for the
remaining bytes.
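
For intuition, a minimal standalone sketch of that split (the variable names,
the 16-byte access width, and the sample length are assumptions for this
example, not the in-tree helpers):

  // Split a byte count into wide-loop iterations and residual bytes,
  // mirroring the main-loop/residual division described above.
  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t CopyLen = 1025;  // bytes to move (example value)
    const uint64_t LoopOpSize = 16; // assumed width of the wide accesses

    uint64_t LoopCount = CopyLen / LoopOpSize;       // wide iterations: 64
    uint64_t Remainder = CopyLen % LoopOpSize;       // residual bytes: 1
    uint64_t MainLoopBytes = LoopCount * LoopOpSize; // == CopyLen - Remainder

    std::printf("%llu wide iterations, %llu residual bytes (%llu bytes wide)\n",
                (unsigned long long)LoopCount, (unsigned long long)Remainder,
                (unsigned long long)MainLoopBytes);
  }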

For memmove, this construct is required in two variants: one for copying
forward and one for copying backwards, to handle overlapping memory
ranges. For the backwards case, the residual code still covers the bytes
at the end of the copied region and is therefore executed before the
wide main loop. This implementation choice is based on the assumption
that we are more likely to encounter memory ranges whose start aligns
with the access width than ones whose end does.
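
As a rough illustration of that control flow, a plain C++ sketch (not the
emitted IR; the 8-byte access width and the function name are assumptions for
the example):

  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  // Wide main loop plus byte-wise residual; the residual runs first when
  // copying backwards and last when copying forwards, so it always covers
  // the end of the moved range.
  void memmove_wide_sketch(unsigned char *dst, const unsigned char *src,
                           std::size_t n) {
    const std::size_t W = sizeof(std::uint64_t); // assumed wide access size
    std::size_t main_bytes = n - n % W;          // bytes for the wide loop
    if (src < dst) {
      // Backwards: residual (end of range) first, then the wide loop.
      for (std::size_t i = n; i > main_bytes; --i)
        dst[i - 1] = src[i - 1];
      for (std::size_t i = main_bytes; i >= W; i -= W) {
        std::uint64_t v;
        std::memcpy(&v, src + i - W, W);
        std::memcpy(dst + i - W, &v, W);
      }
    } else {
      // Forwards: the wide loop first, then the residual (end of range).
      std::size_t i = 0;
      for (; i < main_bytes; i += W) {
        std::uint64_t v;
        std::memcpy(&v, src + i, W);
        std::memcpy(dst + i, &v, W);
      }
      for (; i < n; ++i)
        dst[i] = src[i];
    }
  }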

In microbenchmarks on gfx1030 (AMDGPU), this change yields speedups up
to 16x for memmoves with variable or large constant lengths.

Part of SWDEV-455845.
---
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 515 ++++++++++--
 .../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 103 ++-
 .../CodeGen/AMDGPU/lower-mem-intrinsics.ll    | 787 ++++++++++++++----
 llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll  |   2 +-
 4 files changed, 1121 insertions(+), 286 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index b38db412f786a..ee0d95bbce64f 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -347,6 +347,30 @@ void llvm::createMemCpyLoopUnknownSize(
   }
 }
 
+// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
+// addrspacecast to obtain a pair of pointers in the same address space. The
+// caller needs to ensure that addrspacecasting is possible.
+// No-op if the pointers are in the same address space.
+static std::pair<Value *, Value *>
+tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
+                               const TargetTransformInfo &TTI) {
+  Value *ResAddr1 = Addr1;
+  Value *ResAddr2 = Addr2;
+
+  unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
+  unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
+  if (AS1 != AS2) {
+    if (TTI.isValidAddrSpaceCast(AS2, AS1))
+      ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
+    else if (TTI.isValidAddrSpaceCast(AS1, AS2))
+      ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
+    else
+      llvm_unreachable("Can only lower memmove between address spaces if they "
+                       "support addrspacecast");
+  }
+  return {ResAddr1, ResAddr2};
+}
+
 // Lower memmove to IR. memmove is required to correctly copy overlapping memory
 // regions; therefore, it has to check the relative positions of the source and
 // destination pointers and choose the copy direction accordingly.
@@ -369,17 +393,61 @@ void llvm::createMemCpyLoopUnknownSize(
 //   }
 //   return dst;
 // }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
-                              Value *DstAddr, Value *CopyLen, Align SrcAlign,
-                              Align DstAlign, bool SrcIsVolatile,
-                              bool DstIsVolatile,
-                              const TargetTransformInfo &TTI) {
+//
+// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
+// used for the memory accesses in the loops. Then, additional loops with
+// byte-wise accesses are added for the remaining bytes.
+static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
+                                         Value *SrcAddr, Value *DstAddr,
+                                         Value *CopyLen, Align SrcAlign,
+                                         Align DstAlign, bool SrcIsVolatile,
+                                         bool DstIsVolatile,
+                                         const TargetTransformInfo &TTI) {
   Type *TypeOfCopyLen = CopyLen->getType();
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
   const DataLayout &DL = F->getDataLayout();
-  // TODO: Use different element type if possible?
-  Type *EltTy = Type::getInt8Ty(F->getContext());
+  LLVMContext &Ctx = OrigBB->getContext();
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  bool LoopOpIsInt8 = LoopOpType == Int8Type;
+
+  // If the memory accesses are wider than one byte, residual loops with
+  // i8-accesses are required to move remaining bytes.
+  bool RequiresResidual = !LoopOpIsInt8;
+
+  Type *ResidualLoopOpType = Int8Type;
+  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+  // Calculate the loop trip count and remaining bytes to copy after the loop.
+  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+  ConstantInt *One = ConstantInt::get(ILengthType, 1);
+
+  IRBuilder<> PLBuilder(InsertBefore);
+
+  Value *RuntimeLoopCount = CopyLen;
+  Value *RuntimeLoopRemainder = nullptr;
+  Value *RuntimeBytesCopiedMainLoop = CopyLen;
+  Value *SkipResidualCondition = nullptr;
+  if (RequiresResidual) {
+    RuntimeLoopCount =
+        getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+    RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+                                                   CILoopOpSize, LoopOpSize);
+    RuntimeBytesCopiedMainLoop =
+        PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+    SkipResidualCondition =
+        PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
+  }
+  Value *SkipMainCondition =
+      PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
 
   // Create a comparison of src and dst, based on which we jump to either
   // the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -387,76 +455,374 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
   // structure. Its block terminators (unconditional branches) are replaced by
   // the appropriate conditional branches when the loop is built.
-  ICmpInst *PtrCompare = new ICmpInst(InsertBefore->getIterator(), ICmpInst::ICMP_ULT,
-                                      SrcAddr, DstAddr, "compare_src_dst");
+  // If the pointers are in different address spaces, they need to be converted
+  // to a compatible one. Cases where memory ranges in the different address
+  // spaces cannot overlap are lowered as memcpy and not handled here.
+  auto [CmpSrcAddr, CmpDstAddr] =
+      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+  Value *PtrCompare =
+      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
   Instruction *ThenTerm, *ElseTerm;
-  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(), &ThenTerm,
-                                &ElseTerm);
-
-  // Each part of the function consists of two blocks:
-  //   copy_backwards:        used to skip the loop when n == 0
-  //   copy_backwards_loop:   the actual backwards loop BB
-  //   copy_forward:          used to skip the loop when n == 0
-  //   copy_forward_loop:     the actual forward loop BB
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+                                &ThenTerm, &ElseTerm);
+
+  // If the LoopOpSize is greater than 1, each part of the function consists of
+  // four blocks:
+  //   memmove_copy_backwards:
+  //       skip the residual loop when 0 iterations are required
+  //   memmove_bwd_residual_loop:
+  //       copy the last few bytes individually so that the remaining length is
+  //       a multiple of the LoopOpSize
+  //   memmove_bwd_middle: skip the main loop when 0 iterations are required
+  //   memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
+  //   memmove_copy_forward: skip the main loop when 0 iterations are required
+  //   memmove_fwd_main_loop: the actual forward loop BB with wide accesses
+  //   memmove_fwd_middle: skip the residual loop when 0 iterations are required
+  //   memmove_fwd_residual_loop: copy the last few bytes individually
+  //
+  // The main and residual loop are switched between copying forward and
+  // backward so that the residual loop always operates on the end of the moved
+  // range. This is based on the assumption that buffers whose start is aligned
+  // with the LoopOpSize are more common than buffers whose end is.
+  //
+  // If the LoopOpSize is 1, each part of the function consists of two blocks:
+  //   memmove_copy_backwards: skip the loop when 0 iterations are required
+  //   memmove_bwd_main_loop: the actual backwards loop BB
+  //   memmove_copy_forward: skip the loop when 0 iterations are required
+  //   memmove_fwd_main_loop: the actual forward loop BB
   BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
-  CopyBackwardsBB->setName("copy_backwards");
+  CopyBackwardsBB->setName("memmove_copy_backwards");
   BasicBlock *CopyForwardBB = ElseTerm->getParent();
-  CopyForwardBB->setName("copy_forward");
+  CopyForwardBB->setName("memmove_copy_forward");
   BasicBlock *ExitBB = InsertBefore->getParent();
   ExitBB->setName("memmove_done");
 
-  unsigned PartSize = DL.getTypeStoreSize(EltTy);
-  Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
-  Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
 
-  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
-  // between both backwards and forward copy clauses.
-  ICmpInst *CompareN =
-      new ICmpInst(OrigBB->getTerminator()->getIterator(), ICmpInst::ICMP_EQ, CopyLen,
-                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+  // Accesses in the residual loops do not share the same alignment as those in
+  // the main loops.
+  Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
+  Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
 
   // Copying backwards.
-  BasicBlock *LoopBB =
-    BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
-  IRBuilder<> LoopBuilder(LoopBB);
+  {
+    BasicBlock *MainLoopBB = BasicBlock::Create(
+        F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
+
+    // The predecessor of the memmove_bwd_main_loop. Updated in the
+    // following if a residual loop is emitted first.
+    BasicBlock *PredBB = CopyBackwardsBB;
+
+    if (RequiresResidual) {
+      // backwards residual loop
+      BasicBlock *ResidualLoopBB = BasicBlock::Create(
+          F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
+      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
+      Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
+          ResidualLoopPhi, One, "bwd_residual_index");
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, SrcAddr, ResidualIndex);
+      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+          "element");
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, DstAddr, ResidualIndex);
+      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+                                             ResidualDstAlign, DstIsVolatile);
+
+      // After the residual loop, go to an intermediate block.
+      BasicBlock *IntermediateBB = BasicBlock::Create(
+          F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
+      // Later code expects a terminator in the PredBB.
+      IRBuilder<> IntermediateBuilder(IntermediateBB);
+      IntermediateBuilder.CreateUnreachable();
+      ResidualLoopBuilder.CreateCondBr(
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
+                                           RuntimeBytesCopiedMainLoop),
+          IntermediateBB, ResidualLoopBB);
+
+      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+
+      // How to get to the residual:
+      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
+                         ThenTerm->getIterator());
+      ThenTerm->eraseFromParent();
+
+      PredBB = IntermediateBB;
+    }
 
-  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  Value *IndexPtr = LoopBuilder.CreateSub(
-      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
-  Value *Element = LoopBuilder.CreateAlignedLoad(
-      EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
-      PartSrcAlign, SrcIsVolatile, "element");
-  LoopBuilder.CreateAlignedStore(
-      Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
-      PartDstAlign, DstIsVolatile);
-  LoopBuilder.CreateCondBr(
-      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
-      ExitBB, LoopBB);
-  LoopPhi->addIncoming(IndexPtr, LoopBB);
-  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
-  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm->getIterator());
-  ThenTerm->eraseFromParent();
+    // main loop
+    IRBuilder<> MainLoopBuilder(MainLoopBB);
+    PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
+    Value *MainIndex =
+        MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+    Value *LoadGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+    Value *Element = MainLoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                       DstIsVolatile);
+    MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
+                                 ExitBB, MainLoopBB);
+    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+    MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+
+    // How to get to the main loop:
+    Instruction *PredBBTerm = PredBB->getTerminator();
+    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
+                       PredBBTerm->getIterator());
+    PredBBTerm->eraseFromParent();
+  }
 
   // Copying forward.
-  BasicBlock *FwdLoopBB =
-    BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
-  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
-  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
-  Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
-  Value *FwdElement = FwdLoopBuilder.CreateAlignedLoad(
-      EltTy, SrcGEP, PartSrcAlign, SrcIsVolatile, "element");
-  Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
-  FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign,
-                                    DstIsVolatile);
-  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
-      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
-  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
-                              ExitBB, FwdLoopBB);
-  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
-  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
-  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm->getIterator());
-  ElseTerm->eraseFromParent();
+  // main loop
+  {
+    BasicBlock *MainLoopBB =
+        BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
+    IRBuilder<> MainLoopBuilder(MainLoopBB);
+    PHINode *MainLoopPhi =
+        MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
+    Value *LoadGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+    Value *Element = MainLoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                       DstIsVolatile);
+    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
+    MainLoopPhi->addIncoming(Zero, CopyForwardBB);
+
+    Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
+    BasicBlock *SuccessorBB = ExitBB;
+    if (RequiresResidual)
+      SuccessorBB =
+          BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
+
+    // leaving or staying in the main loop
+    MainLoopBuilder.CreateCondBr(
+        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopCount), SuccessorBB,
+        MainLoopBB);
+
+    // getting in or skipping the main loop
+    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+                       CopyFwdBBTerm->getIterator());
+    CopyFwdBBTerm->eraseFromParent();
+
+    if (RequiresResidual) {
+      BasicBlock *IntermediateBB = SuccessorBB;
+      IRBuilder<> IntermediateBuilder(IntermediateBB);
+      BasicBlock *ResidualLoopBB = BasicBlock::Create(
+          F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
+      IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
+                                       ResidualLoopBB);
+
+      // Residual loop
+      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      PHINode *ResidualLoopPhi =
+          ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, SrcAddr, ResidualLoopPhi);
+      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
+          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
+          "element");
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
+          ResidualLoopOpType, DstAddr, ResidualLoopPhi);
+      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
+                                             ResidualDstAlign, DstIsVolatile);
+      Value *ResidualIndex =
+          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, One);
+      ResidualLoopBuilder.CreateCondBr(
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
+          ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
+      ResidualLoopPhi->addIncoming(RuntimeBytesCopiedMainLoop, IntermediateBB);
+    }
+  }
+}
+
+// Similar to createMemMoveLoopUnknownSize, except that the trip counts are computed at
+// compile time, obsolete loops and branches are omitted, and the residual code
+// is straight-line code instead of a loop.
+static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
+                                       Value *SrcAddr, Value *DstAddr,
+                                       ConstantInt *CopyLen, Align SrcAlign,
+                                       Align DstAlign, bool SrcIsVolatile,
+                                       bool DstIsVolatile,
+                                       const TargetTransformInfo &TTI) {
+  // No need to expand zero length moves.
+  if (CopyLen->isZero())
+    return;
+
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  const DataLayout &DL = F->getDataLayout();
+  LLVMContext &Ctx = OrigBB->getContext();
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+      Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+  // Calculate the loop trip count and remaining bytes to copy after the loop.
+  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+  uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize;
+  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
+
+  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
+  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+  ConstantInt *One = ConstantInt::get(ILengthType, 1);
+  ConstantInt *TripCount = ConstantInt::get(ILengthType, LoopEndCount);
+
+  IRBuilder<> PLBuilder(InsertBefore);
+
+  auto [CmpSrcAddr, CmpDstAddr] =
+      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
+  Value *PtrCompare =
+      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
+  Instruction *ThenTerm, *ElseTerm;
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
+                                &ThenTerm, &ElseTerm);
+
+  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+  BasicBlock *CopyForwardBB = ElseTerm->getParent();
+  BasicBlock *ExitBB = InsertBefore->getParent();
+  ExitBB->setName("memmove_done");
+
+  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+  // Helper function to generate a load/store pair of a given type in the
+  // residual. Used in the forward and backward branches.
+  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
+                                      uint64_t &BytesCopied) {
+    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+    // Calculate the new index
+    unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+
+    uint64_t GepIndex = BytesCopied / OperandSize;
+    assert(GepIndex * OperandSize == BytesCopied &&
+           "Division should have no Remainder!");
+
+    Value *SrcGEP = Builder.CreateInBoundsGEP(
+        OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+    LoadInst *Load =
+        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
+    Value *DstGEP = Builder.CreateInBoundsGEP(
+        OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
+    BytesCopied += OperandSize;
+  };
+
+  // Copying backwards.
+  if (RemainingBytes != 0) {
+    CopyBackwardsBB->setName("memmove_bwd_residual");
+    uint64_t BytesCopied = BytesCopiedInLoop;
+
+    // Residual code is required to move the remaining bytes. We need the same
+    // instructions as in the forward case, only in reverse. So we generate code
+    // the same way, except that we change the IRBuilder insert point for each
+    // load/store pair so that each one is inserted before the previous one
+    // instead of after it.
+    IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI());
+    SmallVector<Type *, 5> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAS, DstAS, PartSrcAlign.value(),
+                                          PartDstAlign.value());
+    for (auto *OpTy : RemainingOps) {
+      // reverse the order of the emitted operations
+      BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI());
+      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
+    }
+  }
+  if (LoopEndCount != 0) {
+    BasicBlock *LoopBB = CopyBackwardsBB;
+    BasicBlock *PredBB = OrigBB;
+    if (RemainingBytes != 0) {
+      // if we introduce residual code, it needs its separate BB
+      LoopBB = CopyBackwardsBB->splitBasicBlock(
+          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
+      PredBB = CopyBackwardsBB;
+    } else {
+      CopyBackwardsBB->setName("memmove_bwd_loop");
+    }
+    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
+    Value *Index = LoopBuilder.CreateSub(LoopPhi, One, "bwd_index");
+    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, Index);
+    Value *Element = LoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, Index);
+    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                   DstIsVolatile);
+
+    // Replace the unconditional branch introduced by
+    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
+    Instruction *UncondTerm = LoopBB->getTerminator();
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
+                             LoopBB);
+    UncondTerm->eraseFromParent();
+
+    LoopPhi->addIncoming(Index, LoopBB);
+    LoopPhi->addIncoming(TripCount, PredBB);
+  }
+
+  // Copying forward.
+  BasicBlock *FwdResidualBB = CopyForwardBB;
+  if (LoopEndCount != 0) {
+    CopyForwardBB->setName("memmove_fwd_loop");
+    BasicBlock *LoopBB = CopyForwardBB;
+    BasicBlock *SuccBB = ExitBB;
+    if (RemainingBytes != 0) {
+      // if we introduce residual code, it needs its separate BB
+      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
+                                              "memmove_fwd_residual");
+      FwdResidualBB = SuccBB;
+    }
+    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
+    Value *LoadGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopPhi);
+    Value *Element = LoopBuilder.CreateAlignedLoad(
+        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
+    Value *StoreGEP =
+        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopPhi);
+    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
+                                   DstIsVolatile);
+    Value *Index = LoopBuilder.CreateAdd(LoopPhi, One);
+    LoopPhi->addIncoming(Index, LoopBB);
+    LoopPhi->addIncoming(Zero, OrigBB);
+
+    // Replace the unconditional branch to turn LoopBB into a loop.
+    Instruction *UncondTerm = LoopBB->getTerminator();
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, TripCount), SuccBB,
+                             LoopBB);
+    UncondTerm->eraseFromParent();
+  }
+
+  if (RemainingBytes != 0) {
+    uint64_t BytesCopied = BytesCopiedInLoop;
+
+    // Residual code is required to move the remaining bytes. In the forward
+    // case, we emit it in the normal order.
+    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+    SmallVector<Type *, 5> RemainingOps;
+    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+                                          SrcAS, DstAS, PartSrcAlign.value(),
+                                          PartDstAlign.value());
+    for (auto *OpTy : RemainingOps)
+      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
+  }
 }
 
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -572,11 +938,8 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
       return true;
     }
 
-    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
-      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
-    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
-      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
-    else {
+    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
+          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
       // We don't know generically if it's legal to introduce an
       // addrspacecast. We need to know either if it's legal to insert an
       // addrspacecast, or if the address spaces cannot alias.
@@ -587,9 +950,15 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
     }
   }
 
-  createMemMoveLoop(
-      /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
-      SrcIsVolatile, DstIsVolatile, TTI);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+    createMemMoveLoopKnownSize(
+        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
+        SrcIsVolatile, DstIsVolatile, TTI);
+  } else {
+    createMemMoveLoopUnknownSize(
+        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+        SrcIsVolatile, DstIsVolatile, TTI);
+  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index 4d4da869d7507..de9af5209db16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -9,56 +9,71 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
 ; LOOP:       ; %bb.0:
 ; LOOP-NEXT:    v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
 ; LOOP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; LOOP-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
-; LOOP-NEXT:    s_cbranch_execz .LBB0_3
-; LOOP-NEXT:  ; %bb.1: ; %copy_forward
-; LOOP-NEXT:    s_mov_b64 s[6:7], 0
+; LOOP-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; LOOP-NEXT:    s_cbranch_execnz .LBB0_3
+; LOOP-NEXT:  ; %bb.1: ; %Flow
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT:    s_cbranch_execnz .LBB0_4
+; LOOP-NEXT:  .LBB0_2: ; %memmove_done
+; LOOP-NEXT:    s_endpgm
+; LOOP-NEXT:  .LBB0_3:
+; LOOP-NEXT:    s_mov_b32 s6, 0
+; LOOP-NEXT:    s_mov_b32 s7, 0xf000
+; LOOP-NEXT:    s_mov_b64 s[4:5], 0
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; LOOP-NEXT:    s_waitcnt vmcnt(0)
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
+; LOOP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; LOOP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; LOOP-NEXT:    s_cbranch_execz .LBB0_2
+; LOOP-NEXT:  .LBB0_4: ; %memmove_bwd_residual
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
-; LOOP-NEXT:    v_mov_b32_e32 v4, s6
-; LOOP-NEXT:    v_mov_b32_e32 v5, s7
-; LOOP-NEXT:  .LBB0_2: ; %copy_forward_loop
-; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT:    s_waitcnt expcnt(2)
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v0, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v1, v5, vcc
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v4
+; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT:    s_waitcnt vmcnt(3)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; LOOP-NEXT:    s_waitcnt vmcnt(2)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; LOOP-NEXT:    s_waitcnt vmcnt(1)
+; LOOP-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    buffer_store_byte v8, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    s_cbranch_vccnz .LBB0_2
-; LOOP-NEXT:  .LBB0_3: ; %Flow17
-; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
-; LOOP-NEXT:    s_cbranch_execz .LBB0_6
-; LOOP-NEXT:  ; %bb.4: ; %copy_backwards
-; LOOP-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; LOOP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; LOOP-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; LOOP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; LOOP-NEXT:    s_mov_b32 s0, -4
-; LOOP-NEXT:    s_mov_b32 s6, 0
-; LOOP-NEXT:    s_mov_b32 s7, 0xf000
-; LOOP-NEXT:    s_mov_b64 s[4:5], 0
-; LOOP-NEXT:    v_mov_b32_e32 v4, s0
-; LOOP-NEXT:  .LBB0_5: ; %copy_backwards_loop
-; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_or_b32_e32 v3, v4, v5
+; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
+; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 1, v4
-; LOOP-NEXT:    s_and_b64 vcc, vcc, exec
-; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[4:7], 0 addr64
-; LOOP-NEXT:    v_add_i32_e64 v0, s[0:1], -1, v0
-; LOOP-NEXT:    v_addc_u32_e64 v1, s[0:1], -1, v1, s[0:1]
-; LOOP-NEXT:    v_add_i32_e64 v2, s[0:1], -1, v2
-; LOOP-NEXT:    v_addc_u32_e64 v3, s[0:1], -1, v3, s[0:1]
-; LOOP-NEXT:    s_cbranch_vccz .LBB0_5
-; LOOP-NEXT:  .LBB0_6: ; %memmove_done
+; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
 ; LOOP-NEXT:    s_endpgm
 ;
 ; UNROLL-LABEL: memmove_p1i8:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 5cb57ee112b3a..9e2e37a886d1f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -17,6 +17,9 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
 
 declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
 declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
 declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
@@ -87,30 +90,25 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 ;
 ; ALL-LABEL: @max_size_small_static_memmove_caller0(
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
-; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -121,30 +119,37 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
 ; OPT-LABEL: @min_size_large_static_memmove_caller0(
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; OPT-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
-; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; OPT:       copy_backwards:
-; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; OPT:       copy_backwards_loop:
-; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
-; OPT-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; OPT:       copy_forward:
-; OPT-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; OPT:       copy_forward_loop:
-; OPT-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; OPT-NEXT:    store i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
-; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_residual:
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 64
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_fwd_residual:
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_DONE]]
 ; OPT:       memmove_done:
 ; OPT-NEXT:    ret void
 ;
@@ -1333,30 +1338,25 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
 ; ALL-LABEL: @memmove_flat_align1_global_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1372,30 +1372,25 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
 ; ALL-LABEL: @memmove_global_align1_flat_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1411,30 +1406,25 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
 ; ALL-LABEL: @memmove_flat_align1_private_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1450,30 +1440,25 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
 ; ALL-LABEL: @memmove_private_align1_flat_align1(
 ; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 256, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 256, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store i8 [[ELEMENT]], ptr [[TMP4]], align 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load i8, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store i8 [[ELEMENT2]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 256
-; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
@@ -1734,6 +1719,477 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
   ret void
 }
 
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_flat_align1_local_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_flat_align1_local_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr [[TMP7]], align 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) {
+; MAX1024-LABEL: @memmove_local_align1_flat_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_local_align1_flat_align1(
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_local_align1_local_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_local_align1_local_align1(
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 32
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
+; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_private_align1_private_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_private_align1_private_align1(
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 16
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; ALL:       memmove_done:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
+; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 15
+; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
+; OPT:       memmove_copy_backwards:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_bwd_residual_loop:
+; OPT-NEXT:    [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
+; OPT-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
+; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
+; OPT:       memmove_bwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_bwd_main_loop:
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
+; OPT:       memmove_copy_forward:
+; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
+; OPT:       memmove_fwd_main_loop:
+; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
+; OPT:       memmove_fwd_middle:
+; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
+; OPT:       memmove_fwd_residual_loop:
+; OPT-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
+; OPT-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
+; OPT-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
+; OPT-NEXT:    br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_empty(
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 65, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 65
+; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; OPT-LABEL: @memmove_global_align4_static_residual_full(
+; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
+; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; OPT:       memmove_bwd_residual:
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
+; OPT:       memmove_bwd_loop:
+; OPT-NEXT:    [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 1
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; OPT:       memmove_fwd_loop:
+; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 1
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 64
+; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
+; OPT:       memmove_fwd_residual:
+; OPT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
+; OPT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
+; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
+; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
+; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
+; OPT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
+; OPT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
+; OPT-NEXT:    [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
+; OPT-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
+; OPT-NEXT:    store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
+; OPT-NEXT:    br label [[MEMMOVE_DONE]]
+; OPT:       memmove_done:
+; OPT-NEXT:    ret void
+;
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false)
+  ret void
+}
+
 define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
 ; OPT-LABEL: @test_umin(
 ; OPT-NEXT:  entry:
@@ -1783,30 +2239,25 @@ define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace
 ;
 ; ALL-LABEL: @memmove_volatile(
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
-; ALL-NEXT:    [[COMPARE_N_TO_0:%.*]] = icmp eq i64 64, 0
-; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
-; ALL:       copy_backwards:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
-; ALL:       copy_backwards_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 64, [[COPY_BACKWARDS]] ]
-; ALL-NEXT:    [[INDEX_PTR]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile i8, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR]]
-; ALL-NEXT:    store volatile i8 [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
-; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
-; ALL:       copy_forward:
-; ALL-NEXT:    br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
-; ALL:       copy_forward_loop:
-; ALL-NEXT:    [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    [[ELEMENT2:%.*]] = load volatile i8, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[INDEX_PTR1]]
-; ALL-NEXT:    store volatile i8 [[ELEMENT2]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
-; ALL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 64
-; ALL-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
+; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
+; ALL:       memmove_bwd_loop:
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 4, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
+; ALL:       memmove_fwd_loop:
+; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 4
+; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index afa7fde6c842b..59e755aee215c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -154,7 +154,7 @@ entry:
 ; IR-LABEL:   @memmove_caller
 ; IR:         icmp ult ptr %src, %dst
 ; IR:         [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
-; IR-NEXT:    %index_ptr = sub i64 [[PHIVAL]], 1
+; IR-NEXT:    %bwd_main_index = sub i64 [[PHIVAL]], 1
 ; IR:         [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
 ; IR:         {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1
 

>From 1079d9c9c00822790a4226ecac5ae6fababdcac1 Mon Sep 17 00:00:00 2001
From: R <rqou at berkeley.edu>
Date: Fri, 26 Jul 2024 08:06:45 +0100
Subject: [PATCH 236/259] [llvm-driver] Add a newline at the end of the help
 message (#99425)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was constantly messing with my terminal prompt 😹
---
 llvm/tools/llvm-driver/llvm-driver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-driver/llvm-driver.cpp b/llvm/tools/llvm-driver/llvm-driver.cpp
index 53a8b9357e378..14ce162faee46 100644
--- a/llvm/tools/llvm-driver/llvm-driver.cpp
+++ b/llvm/tools/llvm-driver/llvm-driver.cpp
@@ -33,7 +33,7 @@ static void printHelpMessage() {
                << subcommands
                << "\n  Type \"llvm <subcommand> --help\" to get more help on a "
                   "specific subcommand\n\n"
-               << "OPTIONS:\n\n  --help - Display this message";
+               << "OPTIONS:\n\n  --help - Display this message\n";
 }
 
 static int findTool(int Argc, char **Argv, const char *Argv0) {

>From 988fd956e776bad8057a8b7ee051a524308c097b Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav at gmail.com>
Date: Fri, 26 Jul 2024 11:09:54 +0400
Subject: [PATCH 237/259] [clang] Implement `__builtin_is_virtual_base_of()`
 intrinsic (#100393)

This patch adds compiler support for
[P2985R0](https://wg21.link/p2985r0) "A type trait for detecting virtual
base classes".
As we recently did with `__is_layout_compatible()` and
`__is_pointer_interconvertible_base_of()`, we support it only in C++
mode and reject VLAs.
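
For reference, a minimal usage sketch (the class names below are hypothetical
and only mirror the behavior exercised by the new type-traits.cpp tests):

```cpp
// The trait evaluates to true only when the first argument is a
// (possibly indirect) virtual base class of the second.
struct B {};
struct VDerived : virtual B {};   // B is a virtual base of VDerived
struct NVDerived : B {};          // B is a non-virtual base of NVDerived
struct Indirect : VDerived {};    // B remains a virtual base, via VDerived

static_assert(__builtin_is_virtual_base_of(B, VDerived));
static_assert(!__builtin_is_virtual_base_of(B, NVDerived));
static_assert(__builtin_is_virtual_base_of(B, Indirect));
static_assert(!__builtin_is_virtual_base_of(B, B)); // a class is not its own virtual base
```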

Resolves #98310.
---
 clang/docs/LanguageExtensions.rst        |  1 +
 clang/docs/ReleaseNotes.rst              |  3 +
 clang/include/clang/Basic/TokenKinds.def |  1 +
 clang/lib/Sema/SemaExprCXX.cpp           | 26 +++++++++
 clang/test/SemaCXX/type-traits.cpp       | 74 +++++++++++++++++++++++-
 5 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 81784c75081ba..a747464582e77 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1546,6 +1546,7 @@ The following type trait primitives are supported by Clang. Those traits marked
 * ``__array_extent(type, dim)`` (Embarcadero):
   The ``dim``'th array bound in the type ``type``, or ``0`` if
   ``dim >= __array_rank(type)``.
+* ``__builtin_is_virtual_base_of`` (C++, GNU, Microsoft)
 * ``__can_pass_in_regs`` (C++)
   Returns whether a class can be passed in registers under the current
   ABI. This type can only be applied to unqualified class types.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5dddd8f1c5af5..b165f6e65636d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -81,6 +81,9 @@ C++23 Feature Support
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Add ``__builtin_is_virtual_base_of`` intrinsic, which supports
+  `P2985R0 A type trait for detecting virtual base classes <https://wg21.link/p2985r0>`_
+
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 8c54661e65cf4..7e638dc1ddcdb 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -501,6 +501,7 @@ TYPE_TRAIT_1(__has_trivial_move_assign, HasTrivialMoveAssign, KEYCXX)
 TYPE_TRAIT_1(__has_trivial_move_constructor, HasTrivialMoveConstructor, KEYCXX)
 
 // GNU and MS Type Traits
+TYPE_TRAIT_2(__builtin_is_virtual_base_of, IsVirtualBaseOf, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_copy, HasNothrowCopy, KEYCXX)
 TYPE_TRAIT_1(__has_nothrow_constructor, HasNothrowConstructor, KEYCXX)
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 51c4a36808fce..029969c1722b7 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -6030,6 +6030,32 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI
     return cast<CXXRecordDecl>(rhsRecord->getDecl())
       ->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getDecl()));
   }
+  case BTT_IsVirtualBaseOf: {
+    const RecordType *BaseRecord = LhsT->getAs<RecordType>();
+    const RecordType *DerivedRecord = RhsT->getAs<RecordType>();
+
+    if (!BaseRecord || !DerivedRecord) {
+      DiagnoseVLAInCXXTypeTrait(Self, Lhs,
+                                tok::kw___builtin_is_virtual_base_of);
+      DiagnoseVLAInCXXTypeTrait(Self, Rhs,
+                                tok::kw___builtin_is_virtual_base_of);
+      return false;
+    }
+
+    if (BaseRecord->isUnionType() || DerivedRecord->isUnionType())
+      return false;
+
+    if (!BaseRecord->isStructureOrClassType() ||
+        !DerivedRecord->isStructureOrClassType())
+      return false;
+
+    if (Self.RequireCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT,
+                                 diag::err_incomplete_type))
+      return false;
+
+    return cast<CXXRecordDecl>(DerivedRecord->getDecl())
+        ->isVirtuallyDerivedFrom(cast<CXXRecordDecl>(BaseRecord->getDecl()));
+  }
   case BTT_IsSame:
     return Self.Context.hasSameType(LhsT, RhsT);
   case BTT_TypeCompatible: {
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 23b07cac13eaf..e131212bb1071 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2402,11 +2402,11 @@ template<typename T> struct DerivedB : BaseA<T> { };
 template<typename T> struct CrazyDerived : T { };
 
 
-class class_forward; // expected-note 2 {{forward declaration of 'class_forward'}}
+class class_forward; // expected-note 4 {{forward declaration of 'class_forward'}}
 
 template <class T> class DerivedTemp : Base {};
 template <class T> class NonderivedTemp {};
-template <class T> class UndefinedTemp; // expected-note {{declared here}}
+template <class T> class UndefinedTemp; // expected-note 2 {{declared here}}
 
 void is_base_of() {
   static_assert(__is_base_of(Base, Derived));
@@ -2457,6 +2457,76 @@ void is_base_of() {
   static_assert(!__is_base_of(DerivedB<int>, BaseA<int>));
 }
 
+struct DerivedTransitiveViaNonVirtual : Derived3 {};
+struct DerivedTransitiveViaVirtual : virtual Derived3 {};
+
+template <typename T>
+struct CrazyDerivedVirtual : virtual T {};
+
+struct DerivedPrivate : private virtual Base {};
+struct DerivedProtected : protected virtual Base {};
+struct DerivedPrivatePrivate : private DerivedPrivate {};
+struct DerivedPrivateProtected : private DerivedProtected {};
+struct DerivedProtectedPrivate : protected DerivedProtected {};
+struct DerivedProtectedProtected : protected DerivedProtected {};
+
+void is_virtual_base_of(int n) {
+  static_assert(!__builtin_is_virtual_base_of(Base, Derived));
+  static_assert(!__builtin_is_virtual_base_of(const Base, Derived));
+  static_assert(!__builtin_is_virtual_base_of(Derived, Base));
+  static_assert(!__builtin_is_virtual_base_of(Derived, int));
+  static_assert(!__builtin_is_virtual_base_of(Base, Base));
+  static_assert(!__builtin_is_virtual_base_of(Base, Derived3));
+  static_assert(!__builtin_is_virtual_base_of(Derived, Derived3));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, Derived3));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, Derived3));
+  static_assert(!__builtin_is_virtual_base_of(BaseA<int>, DerivedB<int>));
+  static_assert(!__builtin_is_virtual_base_of(DerivedB<int>, BaseA<int>));
+  static_assert(!__builtin_is_virtual_base_of(Union, Union));
+  static_assert(!__builtin_is_virtual_base_of(Empty, Empty));
+  static_assert(!__builtin_is_virtual_base_of(class_forward, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+  static_assert(!__builtin_is_virtual_base_of(Empty, class_forward)); // expected-error {{incomplete type 'class_forward' where a complete type is required}}
+  static_assert(!__builtin_is_virtual_base_of(class_forward, Empty));
+  static_assert(!__builtin_is_virtual_base_of(Base&, Derived&));
+  static_assert(!__builtin_is_virtual_base_of(Base[10], Derived[10]));
+  static_assert(!__builtin_is_virtual_base_of(Base[n], Derived[n])); // expected-error 2 {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+  static_assert(!__builtin_is_virtual_base_of(int, int));
+  static_assert(!__builtin_is_virtual_base_of(int[], int[]));
+  static_assert(!__builtin_is_virtual_base_of(long, int));
+  static_assert(!__builtin_is_virtual_base_of(Base, DerivedTemp<int>));
+  static_assert(!__builtin_is_virtual_base_of(Base, NonderivedTemp<int>));
+  static_assert(!__builtin_is_virtual_base_of(Base, UndefinedTemp<int>)); // expected-error {{implicit instantiation of undefined template 'UndefinedTemp<int>'}}
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtected));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivatePrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedPrivateProtected));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedPrivate));
+  static_assert(__builtin_is_virtual_base_of(Base, DerivedProtectedProtected));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaNonVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaNonVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2a, DerivedTransitiveViaVirtual));
+  static_assert(__builtin_is_virtual_base_of(Derived2b, DerivedTransitiveViaVirtual));
+  static_assert(!__builtin_is_virtual_base_of(Base, CrazyDerived<Base>));
+  static_assert(!__builtin_is_virtual_base_of(CrazyDerived<Base>, Base));
+  static_assert(__builtin_is_virtual_base_of(Base, CrazyDerivedVirtual<Base>));
+  static_assert(!__builtin_is_virtual_base_of(CrazyDerivedVirtual<Base>, Base));
+
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(Union, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Union));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, IncompleteStruct));
+  static_assert(!__builtin_is_virtual_base_of(Empty, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, Empty));
+  static_assert(!__builtin_is_virtual_base_of(int, IncompleteUnion));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteUnion, int));
+  static_assert(!__builtin_is_virtual_base_of(Empty, Union));
+  static_assert(!__builtin_is_virtual_base_of(Union, Empty));
+  static_assert(!__builtin_is_virtual_base_of(int, Empty));
+  static_assert(!__builtin_is_virtual_base_of(Union, int));
+  static_assert(!__builtin_is_virtual_base_of(IncompleteStruct, IncompleteStruct[n])); // expected-error {{variable length arrays are not supported in '__builtin_is_virtual_base_of'}}
+}
+
 template<class T, class U>
 class TemplateClass {};
 

>From ed1c67b129bdb4343a147e1d72968e7d1f0cd1cf Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93 at hotmail.co.uk>
Date: Fri, 26 Jul 2024 08:16:08 +0100
Subject: [PATCH 238/259] Add a description parameter to the add_new_check
 script (#100111)

Adds a description parameter that automatically fills in the Release
notes entry and the first line of the check's documentation. If it is
omitted, the usual FIXME markers are left in their place.
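
As a rough sketch of the intended effect (the module, check name and
description text below are invented for illustration), running
"add_new_check.py bugprone unsafe-api -d 'Finds calls to unsafe legacy
APIs.'" should now generate a header whose documentation comment is
pre-filled instead of carrying the FIXME placeholder, plus a release-notes
entry carrying the same text, along the lines of:

  namespace clang::tidy::bugprone {

  /// Finds calls to unsafe legacy APIs.
  ///
  /// For the user-facing documentation see:
  /// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unsafe-api.html
  class UnsafeApiCheck : public ClangTidyCheck {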
---
 clang-tools-extra/clang-tidy/add_new_check.py | 41 +++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 3a62df1f510ba..1ce2019ee33fe 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -16,6 +16,7 @@
 import os
 import re
 import sys
+import textwrap
 
 # Adapts the module's CMakelist file. Returns 'True' if it could add a new
 # entry and 'False' if the entry already existed.
@@ -53,7 +54,14 @@ def adapt_cmake(module_path, check_name_camel):
 
 
 # Adds a header for the new check.
-def write_header(module_path, module, namespace, check_name, check_name_camel):
+def write_header(
+    module_path, module, namespace, check_name, check_name_camel, description
+):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="/// ", subsequent_indent="/// "
+        )
+    )
     filename = os.path.join(module_path, check_name_camel) + ".h"
     print("Creating %s..." % filename)
     with io.open(filename, "w", encoding="utf8", newline="\n") as f:
@@ -85,7 +93,7 @@ def write_header(module_path, module, namespace, check_name, check_name_camel):
 
 namespace clang::tidy::%(namespace)s {
 
-/// FIXME: Write a short description.
+%(description)s
 ///
 /// For the user-facing documentation see:
 /// http://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html
@@ -107,6 +115,7 @@ class %(check_name_camel)s : public ClangTidyCheck {
                 "check_name": check_name,
                 "module": module,
                 "namespace": namespace,
+                "description": wrapped_desc,
             }
         )
 
@@ -235,7 +244,12 @@ def adapt_module(module_path, module, check_name, check_name_camel):
 
 
 # Adds a release notes entry.
-def add_release_notes(module_path, module, check_name):
+def add_release_notes(module_path, module, check_name, description):
+    wrapped_desc = "\n".join(
+        textwrap.wrap(
+            description, width=80, initial_indent="  ", subsequent_indent="  "
+        )
+    )
     check_name_dashes = module + "-" + check_name
     filename = os.path.normpath(
         os.path.join(module_path, "../../docs/ReleaseNotes.rst")
@@ -281,10 +295,10 @@ def add_release_notes(module_path, module, check_name):
                             """- New :doc:`%s
   <clang-tidy/checks/%s/%s>` check.
 
-  FIXME: add release notes.
+%s
 
 """
-                            % (check_name_dashes, module, check_name)
+                            % (check_name_dashes, module, check_name, wrapped_desc)
                         )
                         note_added = True
 
@@ -612,6 +626,13 @@ def main():
         default="c++",
         metavar="LANG",
     )
+    parser.add_argument(
+        "--description",
+        "-d",
+        help="short description of what the check does",
+        default="FIXME: Write a short description",
+        type=str,
+    )
     parser.add_argument(
         "module",
         nargs="?",
@@ -652,10 +673,16 @@ def main():
     else:
         namespace = module
 
-    write_header(module_path, module, namespace, check_name, check_name_camel)
+    description = args.description
+    if not description.endswith("."):
+        description += "."
+
+    write_header(
+        module_path, module, namespace, check_name, check_name_camel, description
+    )
     write_implementation(module_path, module, namespace, check_name_camel)
     adapt_module(module_path, module, check_name, check_name_camel)
-    add_release_notes(module_path, module, check_name)
+    add_release_notes(module_path, module, check_name, description)
     test_extension = language_to_extension.get(args.language)
     write_test(module_path, module, check_name, test_extension)
     write_docs(module_path, module, check_name)

>From 9482a8385b94f2c32379fb0b946516ff54261b32 Mon Sep 17 00:00:00 2001
From: Edd Dawson <edd.dawson at sony.com>
Date: Fri, 26 Jul 2024 08:21:42 +0100
Subject: [PATCH 239/259] [PS5][Driver] Ensure stack sizes are emitted by LTO
 (#100592)

... when requested.

Upstreaming a private patch.

SIE tracker: TOOLCHAIN-16575
---
 clang/lib/Driver/ToolChains/PS4CPU.cpp | 4 ++++
 clang/test/Driver/stack-size-section.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 813a0fbedd2b1..f883f29f0c8ca 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -266,6 +266,10 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (UseJMC)
     AddLTOFlag("-enable-jmc-instrument");
 
+  if (Args.hasFlag(options::OPT_fstack_size_section,
+                   options::OPT_fno_stack_size_section, false))
+    AddLTOFlag("-stack-size-section");
+
   if (Arg *A = Args.getLastArg(options::OPT_fcrash_diagnostics_dir))
     AddLTOFlag(Twine("-crash-diagnostics-dir=") + A->getValue());
 
diff --git a/clang/test/Driver/stack-size-section.c b/clang/test/Driver/stack-size-section.c
index 71b9f85692b99..7cd41e491a817 100644
--- a/clang/test/Driver/stack-size-section.c
+++ b/clang/test/Driver/stack-size-section.c
@@ -14,6 +14,7 @@
 
 // RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
 // RUN: %clang -### --target=x86_64-linux-gnu -flto -fstack-size-section -fno-stack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO-NO
+// RUN: %clang -### --target=x86_64-sie-ps5 -fstack-size-section %s 2>&1 | FileCheck %s --check-prefix=LTO
 
 // LTO: "-plugin-opt=-stack-size-section"
 // LTO-NO-NOT: "-plugin-opt=-stack-size-section"

>From 3921900198653d205753b2d85dc31ca5bc009fdd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 11:27:12 +0400
Subject: [PATCH 240/259] CodeGen: Remove UsesMSVCFloatingPoint from
 MachineModuleInfo (#100368)

This is only used by x86, and only in the AsmPrinter module pass. I
think implementing this by looking at the underlying IR types instead
of the selected instructions is a pretty horrifying implementation,
but that information is still available in the AsmPrinter.

This is https://reviews.llvm.org/D123933 resurrected.

I still don't know what the point of emitting _fltused is, but this
approach of looking at the IR types probably isn't the right way to
do this in the first place. If the intent is to report any FP instructions,
this will miss any implicitly introduced ones during codegen. I also don't
know why just unconditionally emitting it isn't an option.

The last review mentioned that the ARM targets might want to emit this,
but I'm not going to go fix that. If someone wants to emit this on ARM,
they can move this to a common helper or analysis somewhere.
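
For reference, a minimal sketch of the kind of module that should still
trigger the reference when compiled for a *-windows-msvc triple (the
function names are made up; per the check, any instruction with an FP or
FP-vector typed result or operand is enough):

  // double-typed result and operands: the module references _fltused.
  double scale(double X) { return X * 2.0; }

  // An integer-only module would not emit the reference.
  int bump(int X) { return X + 1; }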
---
 llvm/include/llvm/CodeGen/MachineModuleInfo.h |  8 -----
 llvm/lib/CodeGen/MachineModuleInfo.cpp        |  1 -
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 27 -----------------
 llvm/lib/Target/X86/X86AsmPrinter.cpp         | 30 ++++++++++++++++++-
 4 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index 97b439c726b0a..dfa0e993ec06a 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -116,10 +116,6 @@ class MachineModuleInfo {
   /// True if debugging information is available in this module.
   bool DbgInfoAvailable = false;
 
-  /// True if this module is being built for windows/msvc, and uses floating
-  /// point.  This is used to emit an undefined reference to _fltused.
-  bool UsesMSVCFloatingPoint = false;
-
   /// Maps IR Functions to their corresponding MachineFunctions.
   DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
   /// Next unique number available for a MachineFunction.
@@ -189,10 +185,6 @@ class MachineModuleInfo {
   /// Returns true if valid debug info is present.
   bool hasDebugInfo() const { return DbgInfoAvailable; }
 
-  bool usesMSVCFloatingPoint() const { return UsesMSVCFloatingPoint; }
-
-  void setUsesMSVCFloatingPoint(bool b) { UsesMSVCFloatingPoint = b; }
-
   /// \name Exception Handling
   /// \{
 
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 088e76029f1a3..150ab363c8fcd 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -28,7 +28,6 @@ void MachineModuleInfo::initialize() {
   ObjFileMMI = nullptr;
   CurCallSite = 0;
   NextFnNum = 0;
-  UsesMSVCFloatingPoint = false;
   DbgInfoAvailable = false;
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 401d23b22adcd..84331d257a3d0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,30 +417,6 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F,
-                                         MachineModuleInfo &MMI) {
-  // Only needed for MSVC
-  if (!TT.isWindowsMSVCEnvironment())
-    return;
-
-  // If it's already set, nothing to do.
-  if (MMI.usesMSVCFloatingPoint())
-    return;
-
-  for (const Instruction &I : instructions(F)) {
-    if (I.getType()->isFPOrFPVectorTy()) {
-      MMI.setUsesMSVCFloatingPoint(true);
-      return;
-    }
-    for (const auto &Op : I.operands()) {
-      if (Op->getType()->isFPOrFPVectorTy()) {
-        MMI.setUsesMSVCFloatingPoint(true);
-        return;
-      }
-    }
-  }
-}
-
 PreservedAnalyses
 SelectionDAGISelPass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
@@ -802,9 +778,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     }
   }
 
-  // Determine if floating point is used for msvc
-  computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, *CurDAG->getMMI());
-
   // Release function-specific state. SDB and CurDAG are already cleared
   // at this point.
   FuncInfo->clear();
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 0c2c6bf7f8b70..957eb21cf4f8c 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGenTypes/MachineValueType.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -975,6 +976,33 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
   }
 }
 
+/// True if this module is being built for windows/msvc, and uses floating
+/// point. This is used to emit an undefined reference to _fltused. This is
+/// needed in Windows kernel or driver contexts to find and prevent code from
+/// modifying non-GPR registers.
+///
+/// TODO: It would be better if this was computed from MIR by looking for
+/// selected floating-point instructions.
+static bool usesMSVCFloatingPoint(const Triple &TT, const Module &M) {
+  // Only needed for MSVC
+  if (!TT.isWindowsMSVCEnvironment())
+    return false;
+
+  for (const Function &F : M) {
+    for (const Instruction &I : instructions(F)) {
+      if (I.getType()->isFPOrFPVectorTy())
+        return true;
+
+      for (const auto &Op : I.operands()) {
+        if (Op->getType()->isFPOrFPVectorTy())
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
   const Triple &TT = TM.getTargetTriple();
 
@@ -993,7 +1021,7 @@ void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
     // safe to set.
     OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   } else if (TT.isOSBinFormatCOFF()) {
-    if (MMI->usesMSVCFloatingPoint()) {
+    if (usesMSVCFloatingPoint(TT, M)) {
       // In Windows' libcmt.lib, there is a file which is linked in only if the
       // symbol _fltused is referenced. Linking this in causes some
       // side-effects:

>From b009f4685838f7ceff3cb9ff58fed0e25eb93d88 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <serge.guelton at telecom-bretagne.eu>
Date: Fri, 26 Jul 2024 07:34:39 +0000
Subject: [PATCH 241/259] [Flang][Runtime] Explicitly convert shift value to
 SubscriptValue (#99822)

Shift values are within the range of SubscriptValue, but the API forces
them to 64 bits. That assumption doesn't hold on 32-bit machines, so add
an explicit cast.
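
A minimal sketch of the problem, assuming SubscriptValue is a
pointer-sized integer (so 32 bits on a 32-bit target) while the runtime
entry points receive the shift as std::int64_t:

  #include <cstddef>
  #include <cstdint>

  using SubscriptValue = std::ptrdiff_t;  // assumption for illustration

  SubscriptValue sourceAt(SubscriptValue lb, SubscriptValue j,
                          std::int64_t shift, SubscriptValue extent) {
    // SubscriptValue bad{lb + (j + shift) % extent};  // braced init narrows
    //                                                 // int64_t on 32-bit hosts
    return lb + static_cast<SubscriptValue>(j + shift) % extent;
  }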
---
 flang/runtime/transformational.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp
index cf1e61c0844d8..b6b204be4418c 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang/runtime/transformational.cpp
@@ -508,7 +508,8 @@ void RTDEF(CshiftVector)(Descriptor &result, const Descriptor &source,
   SubscriptValue lb{sourceDim.LowerBound()};
   for (SubscriptValue j{0}; j < extent; ++j) {
     SubscriptValue resultAt{1 + j};
-    SubscriptValue sourceAt{lb + (j + shift) % extent};
+    SubscriptValue sourceAt{
+        lb + static_cast<SubscriptValue>(j + shift) % extent};
     if (sourceAt < lb) {
       sourceAt += extent;
     }
@@ -619,7 +620,7 @@ void RTDEF(EoshiftVector)(Descriptor &result, const Descriptor &source,
   }
   SubscriptValue lb{source.GetDimension(0).LowerBound()};
   for (SubscriptValue j{1}; j <= extent; ++j) {
-    SubscriptValue sourceAt{lb + j - 1 + shift};
+    SubscriptValue sourceAt{lb + j - 1 + static_cast<SubscriptValue>(shift)};
     if (sourceAt >= lb && sourceAt < lb + extent) {
       CopyElement(result, &j, source, &sourceAt, terminator);
     } else if (boundary) {

>From a82032918cd445e5750e171f57d4f3d7096c021a Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 26 Jul 2024 00:42:24 -0700
Subject: [PATCH 242/259] [AMDGPU] Remove -wavefrontsize32 and -wavefrontsize64
 from GFX10+ tests (NFC) (#100711)

They are no longer needed after the patch "[AMDGPU] Remove wavefrontsize
feature from GFX10" (https://github.com/llvm/llvm-project/pull/98400).
The exception is when "target-features" is set to "+wavefrontsize32" or
"+wavefrontsize64": there we still need to remove one wavefrontsize feature
before adding the other, to make sure only one of them is present.
---
 llvm/test/CodeGen/AMDGPU/amdpal-elf.ll        |  8 +++----
 .../AMDGPU/atomic_optimizations_buffer.ll     | 12 +++++-----
 .../atomic_optimizations_global_pointer.ll    | 24 +++++++++----------
 .../atomic_optimizations_local_pointer.ll     | 16 ++++++-------
 .../atomic_optimizations_pixelshader.ll       |  8 +++----
 .../AMDGPU/atomic_optimizations_raw_buffer.ll | 12 +++++-----
 .../atomic_optimizations_struct_buffer.ll     | 12 +++++-----
 llvm/test/CodeGen/AMDGPU/basic-branch.ll      |  4 ++--
 llvm/test/CodeGen/AMDGPU/early-term.mir       |  4 ++--
 .../AMDGPU/global_atomics_scan_fadd.ll        | 16 ++++++-------
 .../AMDGPU/global_atomics_scan_fmax.ll        | 16 ++++++-------
 .../AMDGPU/global_atomics_scan_fmin.ll        | 16 ++++++-------
 .../AMDGPU/global_atomics_scan_fsub.ll        | 16 ++++++-------
 .../CodeGen/AMDGPU/hazard-hidden-bundle.mir   |  2 +-
 llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir |  2 +-
 .../AMDGPU/hsa-generic-target-features.ll     | 14 +++++------
 .../AMDGPU/hsa-metadata-wavefrontsize.ll      |  8 +++----
 llvm/test/CodeGen/AMDGPU/hsa.ll               |  8 +++----
 .../CodeGen/AMDGPU/huge-private-buffer.ll     |  6 ++---
 .../AMDGPU/insert-delay-alu-literal.mir       |  2 +-
 .../CodeGen/AMDGPU/insert-delay-alu-parse.mir |  2 +-
 llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir |  2 +-
 .../isel-amdgcn-cs-chain-intrinsic-w32.ll     |  8 +++----
 .../isel-amdgcn-cs-chain-intrinsic-w64.ll     |  8 +++----
 .../CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 16 ++++++-------
 .../isel-amdgpu-cs-chain-preserve-cc.ll       | 16 ++++++-------
 .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll  |  4 ++--
 .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll  |  4 ++--
 .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll  |  4 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll   |  8 +++----
 .../AMDGPU/llvm.amdgcn.init.exec.wave32.ll    | 16 ++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll  |  4 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll     |  2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll     |  2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll  |  4 ++--
 ...uf-legalize-operands-non-ptr-intrinsics.ll |  8 +++----
 .../CodeGen/AMDGPU/mubuf-legalize-operands.ll |  8 +++----
 .../AMDGPU/mubuf-legalize-operands.mir        |  8 +++----
 .../AMDGPU/multi-divergent-exit-region.ll     |  2 +-
 .../AMDGPU/s_or_saveexec_xor_combine.mir      |  4 ++--
 llvm/test/CodeGen/AMDGPU/sgpr-spill.mir       |  2 +-
 .../AMDGPU/sink-after-control-flow-postra.mir |  2 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |  4 ++--
 llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir  |  4 ++--
 .../CodeGen/AMDGPU/spill-special-sgpr.mir     |  4 ++--
 llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll       |  2 +-
 .../AMDGPU/vccz-corrupt-bug-workaround.mir    |  4 ++--
 .../CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir |  4 ++--
 .../CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir  |  4 ++--
 .../AMDGPU/verify-constant-bus-violations.mir |  8 +++----
 .../test/CodeGen/AMDGPU/verify-vopd-gfx12.mir |  4 ++--
 llvm/test/CodeGen/AMDGPU/verify-vopd.mir      |  4 ++--
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  2 +-
 .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll |  2 +-
 .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll     |  2 +-
 .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll |  2 +-
 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll    |  2 +-
 .../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir |  2 +-
 llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll    |  2 +-
 llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll  |  2 +-
 llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll  |  2 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |  2 +-
 62 files changed, 201 insertions(+), 201 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 91abbfff7f2de..74a7ca243b56a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 6ec6f6aebf2b1..17fe3adc22169 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index da88b6792f269..16f3ff4be6b50 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,21 +2,21 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb05613da0453..c7296185422ce 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,17 +2,17 @@
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 29704959fc176..f67fcd6e0caf5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-- - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
 ; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 
 declare i1 @llvm.amdgcn.wqm.vote(i1)
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 7ae4766584e00..3a2efadac067d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 62e50b2597c17..d0c0b62c78e42 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 2a7206c6ac43f..bc206656cf181 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope  -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
index 99ea18c37050b..77bc9729ee845 100644
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs  %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s
 
 --- |
   define amdgpu_ps void @early_term_scc0_end_block() {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c05f9c679979d..600c35b6862f6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 46f0bb0393885..954d810d7aa4e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare float @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd5e589ec2be7..4ab86835f90d1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare float @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5ffa71d37164c..cbe243a949115 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index c17feaed83f6c..e2635fc6ecbbb 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
 
 # GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle
 # GCN: bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index 8d02f0614dfc3..401f6e303e796 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-xnack -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
 
 # GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle
 # GCN:          S_LOAD_DWORDX2_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
index 1cb27dc6fc45c..1801082b60030 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-generic-target-features.ll
@@ -6,13 +6,13 @@
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic < %s | FileCheck -check-prefix=CU %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic < %s | FileCheck -check-prefix=CU %s
 
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=W32 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
-; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=W32 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx11-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=W64 %s
 
 ; Checks 10.1, 10.3 and 11 generic targets allow cumode/wave64.
 
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
index 3dc9f209d01e2..431ba38f60a02 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-wavefrontsize.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GCN,GFX10-32 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10-64 %s
 
 ; GCN:      amdhsa.kernels:
 ; GCN:      .name: wavefrontsize
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 487e62b6c3495..37476203fbfad 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -5,10 +5,10 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -S --sd --syms - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index b9269e2b24284..2cb440b1b7a01 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH128K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH256K %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH1024K %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
index 18bc442ae98e0..266da50f6e543 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-literal.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
index af2e6001502f6..af7ba4e3fe653 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-parse.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-after=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 0bdd2494e7ad6..5c9c0d1119163 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
 
 ---
 name: valu_dep_1
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index 1fd0c67737a27..05fd883768480 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index fa447f981bf3a..28492f1426921 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
 declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index ab160ffc10ed0..acd48a64dea1f 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
 
 ; Test formal argument lowering as well as calls to amdgpu_gfx functions.
 
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ba946fe00a8c5..c62b4e565078e 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
 
 ; We only care about which physical registers the parameters are copied from;
 ; the function bodies are just some arbitrary uses.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 3ff34c0c2eaff..047b35b8c0f9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.ballot.i32(i1)
 declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 0e659b758cd0f..88e3929819313 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index d941830e8dafc..a5841e04f6e07 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
 declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index 3a540bdf5b53d..b61ca566bdfad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}full_mask:
 ; GCN: s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
index d3961c04838cf..d4ae040ce6128 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.wave32.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
 
 ; GCN-LABEL: {{^}}test_init_exec:
 ; GFX1032: s_mov_b32 exec_lo, 0x12345
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 63d272607563c..e0dacd47e51c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}gs_const:
 ; GCN-NOT: v_cmpx
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index a538364bf2ec4..3d5e6dca45618 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index 368d637f8ac1b..d11aaf0036b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index d37b4b41fac3d..684ca3aac7c31 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
 
 define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
 ; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index b5ee6689f8dc3..06ebd86b79510 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 162c47f879465..2591ff4bd2538 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
 ; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index ece2e1b653d34..c702de6285d9b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,10 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
 
 # Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 #
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 92b2f516277bc..10d08032bf59a 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
 
 ; Add an extra verifier runs. There were some cases where invalid IR
diff --git a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
index 9ad1660264519..c1121f902982a 100644
--- a/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/s_or_saveexec_xor_combine.mir
@@ -1,5 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,WAVE64 %s
 
 ---
 
@@ -124,4 +124,4 @@ body: |
   bb.0:
     liveins: $sgpr0, $sgpr1
     renamable $sgpr1 = S_MOV_B32 renamable $sgpr0
-    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
\ No newline at end of file
+    renamable $sgpr0 = S_OR_SAVEEXEC_B32 killed renamable $sgpr0, implicit-def $exec, implicit-def $scc, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
index f19b0a5f53e95..eaa3d22f0465c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-MUBUF %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN32-MUBUF %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GCN64-FLATSCR %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -filetype=obj -verify-machineinstrs -start-before=prologepilog %s -o /dev/null
 # Check not crashing when emitting ISA
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
index 266e67364c82b..ff87c28bb7ec5 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow-postra.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=postra-machine-sink -mattr=+wavefrontsize64 -o - %s | FileCheck -check-prefixes=GFX10 %s
 
 # Ensure that PostRA Machine Sink does not sink instructions
 # past block prologues which would overwrite their uses.
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a440b87efc81f..eebd32cd67e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 
 define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
 ; GCN-LABEL: test_kill_depth_0_imm_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
index 0f692107711df..2f1eeb82bf648 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN %s
 
 # GCN-LABEL: name: hazard_smem_war
 # GCN:        S_LOAD_DWORD_IMM
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
index 537aca12bdb70..639bf6a6d550c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=GFX11 %s
 
 --- |
   define amdgpu_kernel void @check_vcc() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 4b9b5f9ffdf84..39c7538738eb1 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=+wavefrontsize64 --global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK %s
 
 define amdgpu_kernel void @icmp_test() {
 ; CHECK-LABEL: icmp_test:
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index e143611ff9b01..17e3d93ed393b 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=tahiti -o - %s | FileCheck %s -check-prefixes=CHECK,SI
 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
-# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
+# RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s
 ---
 # CHECK-LABEL: name: vccz_corrupt_workaround
 # CHECK: $vcc = V_CMP_EQ_F32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
index 4592498199d18..47f13cb5ebe16 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
 ---
 
 # After the Optimize exec masking (post-RA) pass, there's a change of having v_cmpx instructions
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
index 645d473819139..e97c518da1e3c 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
 # GCN:        $sgpr0 = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
index 81d17a8fd0f90..76713dd2996dc 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-constant-bus-violations.mir
@@ -1,9 +1,9 @@
 # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
 # RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX9-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -implicit-check-not="Bad machine code" -check-prefix=GFX10PLUS-ERR %s
 
 # GFX9-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX9-ERR: $vgpr0 = V_CNDMASK_B32_e64 0, $sgpr0, 0, -1, killed $sgpr0_sgpr1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
index 6c55183bb5287..bbd0817d5995a 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd-gfx12.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
 
 # GFX12-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX12-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
index dc7d4afa85741..b310f55d5b86b 100644
--- a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
+++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir
@@ -1,5 +1,5 @@
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 --passes='machine-function(verify)' -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s
 
 # GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction ***
 # GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc_lo, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index ab1121a705529..645f4981ba317 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index 829d77018ff96..ad1d66e448b24 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 161d222d10ff7..c52b079aec2f7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index 511a116a78be5..ebbb3d1fe7dea 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 5fde11cb4b1b1..0f8df5a6e3d2d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 277db33e940dd..4073964e2b038 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
 
 # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
 #  $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 801bb06aacb63..2efbd3277f209 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 12b65986cf1c7..ac218a2b19c67 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index b43969dd4f5c0..9ec2d8f33edaa 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 11003c4c9edfd..8f052efa51100 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
 
 ; Check that WQM isn't triggered by image load/store intrinsics.
 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {

>From ed4e75d5e5ada30c37c57df032378a77e6dd598e Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp at bytedance.com>
Date: Fri, 26 Jul 2024 15:47:47 +0800
Subject: [PATCH 243/259] [CodeGen] Remove AA parameter of isSafeToMove
 (#100691)

The `AA` parameter is unused; most callers simply pass a nullptr.

It has been dead since the use of `AA` was removed in 8d0383e.
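
For illustration only (the pass context and `MI` are hypothetical, not
part of this patch), callers now simply drop the first argument:

  bool SawStore = false;
  // Before this patch: if (MI.isSafeToMove(/*AA=*/nullptr, SawStore))
  if (MI.isSafeToMove(SawStore)) {
    // Safe to move MI; SawStore reports whether a store (or call) lies
    // between the instruction's location and its intended destination.
  }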
---
 llvm/include/llvm/CodeGen/MachineInstr.h               | 2 +-
 llvm/lib/CodeGen/BranchFolding.cpp                     | 4 ++--
 llvm/lib/CodeGen/DeadMachineInstructionElim.cpp        | 2 +-
 llvm/lib/CodeGen/EarlyIfConversion.cpp                 | 2 +-
 llvm/lib/CodeGen/GlobalISel/Utils.cpp                  | 2 +-
 llvm/lib/CodeGen/IfConversion.cpp                      | 2 +-
 llvm/lib/CodeGen/LiveRangeEdit.cpp                     | 4 ++--
 llvm/lib/CodeGen/LiveRangeShrink.cpp                   | 2 +-
 llvm/lib/CodeGen/MachineInstr.cpp                      | 2 +-
 llvm/lib/CodeGen/MachineLICM.cpp                       | 2 +-
 llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp          | 3 +--
 llvm/lib/CodeGen/MachineSink.cpp                       | 8 ++++----
 llvm/lib/CodeGen/ModuloSchedule.cpp                    | 2 +-
 llvm/lib/CodeGen/RegisterCoalescer.cpp                 | 2 +-
 llvm/lib/CodeGen/TwoAddressInstructionPass.cpp         | 4 ++--
 llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 2 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp               | 2 +-
 llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp         | 2 +-
 llvm/lib/Target/Hexagon/HexagonGenInsert.cpp           | 2 +-
 llvm/lib/Target/Lanai/LanaiInstrInfo.cpp               | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp               | 2 +-
 llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp           | 3 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp                   | 2 +-
 23 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 229f5159fd6ef..04c8144f2fe7a 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1723,7 +1723,7 @@ class MachineInstr
   /// Return true if it is safe to move this instruction. If
   /// SawStore is set to true, it means that there is a store (or call) between
   /// the instruction's location and its intended destination.
-  bool isSafeToMove(AAResults *AA, bool &SawStore) const;
+  bool isSafeToMove(bool &SawStore) const;
 
   /// Returns true if this instruction's memory access aliases the memory
   /// access of Other.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 92a03eb52e35d..1dc278586f117 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1891,7 +1891,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
   // Also avoid moving code above predicated instruction since it's hard to
   // reason about register liveness with predicated instruction.
   bool DontMoveAcrossStore = true;
-  if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+  if (!PI->isSafeToMove(DontMoveAcrossStore) || TII->isPredicated(*PI))
     return MBB->end();
 
   // Find out what registers are live. Note this routine is ignoring other live
@@ -2015,7 +2015,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
       break;
 
     bool DontMoveAcrossStore = true;
-    if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+    if (!TIB->isSafeToMove(DontMoveAcrossStore))
       break;
 
     // Remove kills from ActiveDefsSet, these registers had short live ranges.
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 578854cdb4a5d..7fc25cd889a0d 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -92,7 +92,7 @@ bool DeadMachineInstructionElimImpl::isDead(const MachineInstr *MI) const {
 
   // Don't delete instructions with side effects.
   bool SawStore = false;
-  if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+  if (!MI->isSafeToMove(SawStore) && !MI->isPHI())
     return false;
 
   // Examine each operand.
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index a5c99498921db..d506c625d8ca5 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -235,7 +235,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
 
     // We never speculate stores, so an AA pointer isn't necessary.
     bool DontMoveAcrossStore = true;
-    if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+    if (!MI.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "Can't speculate: " << MI);
       return false;
     }
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index c906f3a7c9227..080664843aaeb 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -236,7 +236,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
   // If we can move an instruction, we can remove it.  Otherwise, it has
   // a side-effect of some sort.
   bool SawStore = false;
-  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
+  if (!MI.isSafeToMove(SawStore) && !MI.isPHI())
     return false;
 
   // Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index f3789569b78fb..ba5605ad5cb4a 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -2097,7 +2097,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 static bool MaySpeculate(const MachineInstr &MI,
                          SmallSet<MCPhysReg, 4> &LaterRedefs) {
   bool SawStore = true;
-  if (!MI.isSafeToMove(nullptr, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   for (const MachineOperand &MO : MI.operands()) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7b7b5459ad7b2..c3c581d42236e 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -238,7 +238,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   // We also need to make sure it is safe to move the load.
   // Assume there are stores between DefMI and UseMI.
   bool SawStore = true;
-  if (!DefMI->isSafeToMove(nullptr, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return false;
 
   LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
@@ -300,7 +300,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
 
   // Use the same criteria as DeadMachineInstructionElim.
   bool SawStore = false;
-  if (!MI->isSafeToMove(nullptr, SawStore)) {
+  if (!MI->isSafeToMove(SawStore)) {
     LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
     return;
   }
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index af7d6c4403b8f..3e3e3e51bfe9c 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -153,7 +153,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
           }
       }
 
-      if (!MI.isSafeToMove(nullptr, SawStore)) {
+      if (!MI.isSafeToMove(SawStore)) {
         // If MI has side effects, it should become a barrier for code motion.
         // IOM is rebuild from the next instruction to prevent later
         // instructions from being moved before this MI.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index be64e9c8452f6..583bc1bc4a42c 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1293,7 +1293,7 @@ void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
 /// isSafeToMove - Return true if it is safe to move this instruction. If
 /// SawStore is set to true, it means that there is a store (or call) between
 /// the instruction's location and its intended destination.
-bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
+bool MachineInstr::isSafeToMove(bool &SawStore) const {
   // Ignore stuff that we obviously can't move.
   //
   // Treat volatile loads as stores. This is not strictly necessary for
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d75df2adc440d..1e4bf4bba5622 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1075,7 +1075,7 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
 bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
   // Check if it's safe to move the instruction.
   bool DontMoveAcrossStore = !HoistConstLoads || !AllowedToHoistLoads[CurLoop];
-  if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+  if ((!I.isSafeToMove(DontMoveAcrossStore)) &&
       !(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
     LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
     return false;
diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index 1f596cd1bd2ec..9d650e4bea7e1 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -158,8 +158,7 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
                         Register FrameReg) {
   DefedReg = MCRegister::NoRegister;
   bool SawStore = true;
-  if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
-      MI->isInlineAsm())
+  if (!MI->isSafeToMove(SawStore) || MI->isImplicitDef() || MI->isInlineAsm())
     return false;
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 4b3ff57fb478a..f10b98cebd133 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -374,7 +374,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
 
   // Check if it's safe to move the instruction.
   bool SawStore = true;
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   // Convergent operations may not be made control-dependent on additional
@@ -687,7 +687,7 @@ void MachineSinking::FindCycleSinkCandidates(
       continue;
     }
     bool DontMoveAcrossStore = true;
-    if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) {
+    if (!MI.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n");
       continue;
     }
@@ -1654,7 +1654,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     return false;
 
   // Check if it's safe to move the instruction.
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(SawStore))
     return false;
 
   // Convergent operations may not be made control-dependent on additional
@@ -1705,7 +1705,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     bool TryBreak = false;
     bool Store =
         MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
-    if (!MI.isSafeToMove(AA, Store)) {
+    if (!MI.isSafeToMove(Store)) {
       LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
       TryBreak = true;
     }
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0f29ebe3ee798..dae0cb2c900e5 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -739,7 +739,7 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
       bool SawStore = false;
       // Check if it's safe to remove the instruction due to side effects.
       // We can, and want to, remove Phis here.
-      if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+      if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
         ++MI;
         continue;
       }
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 043ea20191487..f6c53f3051c2f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1320,7 +1320,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (!definesFullReg(*DefMI, SrcReg))
     return false;
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(AA, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return false;
   const MCInstrDesc &MCID = DefMI->getDesc();
   if (MCID.getNumDefs() != 1)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 6f7905c8988dc..fb6274b09919b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -948,7 +948,7 @@ bool TwoAddressInstructionImpl::rescheduleMIBelowKill(
     return false;
 
   bool SeenStore = true;
-  if (!MI->isSafeToMove(AA, SeenStore))
+  if (!MI->isSafeToMove(SeenStore))
     return false;
 
   if (TII->getInstrLatency(InstrItins, *MI) > 1)
@@ -1131,7 +1131,7 @@ bool TwoAddressInstructionImpl::rescheduleKillAboveMI(
     return false;
 
   bool SeenStore = true;
-  if (!KillMI->isSafeToMove(AA, SeenStore))
+  if (!KillMI->isSafeToMove(SeenStore))
     return false;
 
   SmallVector<Register, 2> Uses;
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index fc4d05c4b8b21..49e5211af50cc 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -416,7 +416,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
 
     // We never speculate stores, so an AA pointer isn't necessary.
     bool DontMoveAcrossStore = true;
-    if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+    if (!I.isSafeToMove(DontMoveAcrossStore)) {
       LLVM_DEBUG(dbgs() << "Can't speculate: " << I);
       return false;
     }
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 18cb516d41c1f..3b9195b4e3d85 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2311,7 +2311,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 6926b02701771..1922675917752 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1037,7 +1037,7 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
     if (MI->isInlineAsm())
       continue;
     // Delete PHIs if possible.
-    if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+    if (!MI->isPHI() && !MI->isSafeToMove(Store))
       continue;
 
     bool AllDead = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 8840c272057ab..5e52cf03cfbc7 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -1451,7 +1451,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
         Opc == TargetOpcode::LIFETIME_END)
       continue;
     bool Store = false;
-    if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+    if (MI->isInlineAsm() || !MI->isSafeToMove(Store))
       continue;
 
     bool AllDead = true;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index b8a37435f5a64..9f3d9b7aaa6f9 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -483,7 +483,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1df51c1595b8d..0620c3fc12adc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1365,7 +1365,7 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg,
       return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+  if (!MI->isSafeToMove(DontMoveAcrossStores))
     return nullptr;
   return MI;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 16bbfd44ef8a9..d0758891fe570 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -631,8 +631,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(nullptr, SawStore) ||
-      !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+  if (!DefMI->isSafeToMove(SawStore) || !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
     return nullptr;
 
   int UseOpIdx =
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f3e6aaf35ca82..918a6089fe9a4 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5588,7 +5588,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(nullptr, SawStore))
+  if (!DefMI->isSafeToMove(SawStore))
     return nullptr;
 
   // Collect information about virtual register operands of MI.

>From 9440a49b0ff812c889927e4dc81e746b32124bb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= <balazs.keri at ericsson.com>
Date: Fri, 26 Jul 2024 10:10:36 +0200
Subject: [PATCH 244/259] [clang][analyzer] MmapWriteExecChecker improvements
 (#97078)

Read the 'mmap' protection flags from their macro definitions (via the
preprocessor) instead of checker options, and use a more direct test for
the write+exec error case. Checker messages are improved too.
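
As a hedged illustration (not taken from the patch's tests), the checker
now warns on any mmap or mprotect call whose prot argument has both
PROT_WRITE and PROT_EXEC set, with the flag values taken from the
platform's macro definitions:

  #include <sys/mman.h>

  void *make_wx_page(void) {
    // Reported: requests a mapping that is both writable and executable.
    return mmap(0, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                MAP_PRIVATE | MAP_ANON, -1, 0);
  }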
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td | 12 ----
 .../Checkers/MmapWriteExecChecker.cpp         | 58 +++++++++----------
 clang/test/Analysis/analyzer-config.c         |  2 -
 clang/test/Analysis/mmap-writeexec.c          | 11 ++--
 4 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index ec5dbd28a5272..38b55a0eb0a7b 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1045,18 +1045,6 @@ def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
 
 def MmapWriteExecChecker : Checker<"MmapWriteExec">,
   HelpText<"Warn on mmap() calls that are both writable and executable">,
-  CheckerOptions<[
-    CmdLineOption<Integer,
-                  "MmapProtExec",
-                  "Specifies the value of PROT_EXEC",
-                  "0x04",
-                  Released>,
-    CmdLineOption<Integer,
-                  "MmapProtRead",
-                  "Specifies the value of PROT_READ",
-                  "0x01",
-                  Released>
-  ]>,
   Documentation<HasDocumentation>;
 
 def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
index cd1dd1b2fc511..4b8e5216550d9 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
@@ -21,49 +21,56 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
 
 using namespace clang;
 using namespace ento;
 
 namespace {
-class MmapWriteExecChecker : public Checker<check::PreCall> {
+class MmapWriteExecChecker
+    : public Checker<check::ASTDecl<TranslationUnitDecl>, check::PreCall> {
   CallDescription MmapFn{CDM::CLibrary, {"mmap"}, 6};
   CallDescription MprotectFn{CDM::CLibrary, {"mprotect"}, 3};
-  static int ProtWrite;
-  static int ProtExec;
-  static int ProtRead;
   const BugType BT{this, "W^X check fails, Write Exec prot flags set",
                    "Security"};
 
+  // Default values are used if definition of the flags is not found.
+  mutable int ProtRead = 0x01;
+  mutable int ProtWrite = 0x02;
+  mutable int ProtExec = 0x04;
+
 public:
+  void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &Mgr,
+                    BugReporter &BR) const;
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
-  int ProtExecOv;
-  int ProtReadOv;
 };
 }
 
-int MmapWriteExecChecker::ProtWrite = 0x02;
-int MmapWriteExecChecker::ProtExec  = 0x04;
-int MmapWriteExecChecker::ProtRead  = 0x01;
+void MmapWriteExecChecker::checkASTDecl(const TranslationUnitDecl *TU,
+                                        AnalysisManager &Mgr,
+                                        BugReporter &BR) const {
+  Preprocessor &PP = Mgr.getPreprocessor();
+  const std::optional<int> FoundProtRead = tryExpandAsInteger("PROT_READ", PP);
+  const std::optional<int> FoundProtWrite =
+      tryExpandAsInteger("PROT_WRITE", PP);
+  const std::optional<int> FoundProtExec = tryExpandAsInteger("PROT_EXEC", PP);
+  if (FoundProtRead && FoundProtWrite && FoundProtExec) {
+    ProtRead = *FoundProtRead;
+    ProtWrite = *FoundProtWrite;
+    ProtExec = *FoundProtExec;
+  }
+}
 
 void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
-                                         CheckerContext &C) const {
+                                        CheckerContext &C) const {
   if (matchesAny(Call, MmapFn, MprotectFn)) {
     SVal ProtVal = Call.getArgSVal(2);
     auto ProtLoc = ProtVal.getAs<nonloc::ConcreteInt>();
     if (!ProtLoc)
       return;
     int64_t Prot = ProtLoc->getValue().getSExtValue();
-    if (ProtExecOv != ProtExec)
-      ProtExec = ProtExecOv;
-    if (ProtReadOv != ProtRead)
-      ProtRead = ProtReadOv;
-
-    // Wrong settings
-    if (ProtRead == ProtExec)
-      return;
 
-    if ((Prot & (ProtWrite | ProtExec)) == (ProtWrite | ProtExec)) {
+    if ((Prot & ProtWrite) && (Prot & ProtExec)) {
       ExplodedNode *N = C.generateNonFatalErrorNode();
       if (!N)
         return;
@@ -80,17 +87,10 @@ void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
   }
 }
 
-void ento::registerMmapWriteExecChecker(CheckerManager &mgr) {
-  MmapWriteExecChecker *Mwec =
-      mgr.registerChecker<MmapWriteExecChecker>();
-  Mwec->ProtExecOv =
-    mgr.getAnalyzerOptions()
-      .getCheckerIntegerOption(Mwec, "MmapProtExec");
-  Mwec->ProtReadOv =
-    mgr.getAnalyzerOptions()
-      .getCheckerIntegerOption(Mwec, "MmapProtRead");
+void ento::registerMmapWriteExecChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<MmapWriteExecChecker>();
 }
 
-bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterMmapWriteExecChecker(const CheckerManager &) {
   return true;
 }
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 2a4c40005a4dc..b8dbcdd7e55af 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -9,8 +9,6 @@
 // CHECK-NEXT: alpha.clone.CloneChecker:ReportNormalClones = true
 // CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false
 // CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtExec = 0x04
-// CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtRead = 0x01
 // CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
 // CHECK-NEXT: apply-fixits = false
 // CHECK-NEXT: assume-controlled-environment = false
diff --git a/clang/test/Analysis/mmap-writeexec.c b/clang/test/Analysis/mmap-writeexec.c
index 8fd86ceb9d2a2..579cc75069eec 100644
--- a/clang/test/Analysis/mmap-writeexec.c
+++ b/clang/test/Analysis/mmap-writeexec.c
@@ -1,13 +1,14 @@
-// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -analyzer-config alpha.security.MmapWriteExec:MmapProtExec=1 -analyzer-config alpha.security.MmapWriteExec:MmapProtRead=4 -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
+// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=alpha.security.MmapWriteExec -DUSE_ALTERNATIVE_PROT_EXEC_DEFINITION -verify %s
 // RUN: %clang_analyze_cc1 -triple x86_64-unknown-apple-darwin10 -analyzer-checker=alpha.security.MmapWriteExec -verify %s
 
-#define PROT_WRITE  0x02
 #ifndef USE_ALTERNATIVE_PROT_EXEC_DEFINITION
-#define PROT_EXEC   0x04
-#define PROT_READ   0x01
-#else
 #define PROT_EXEC   0x01
+#define PROT_WRITE  0x02
 #define PROT_READ   0x04
+#else
+#define PROT_EXEC   0x08
+#define PROT_WRITE  0x04
+#define PROT_READ   0x02
 #endif
 #define MAP_PRIVATE 0x0002
 #define MAP_ANON    0x1000

>From f8ae128755777424cf4133e4e8e819c0bc08d2b1 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Fri, 26 Jul 2024 09:22:47 +0100
Subject: [PATCH 245/259] [clang][FMV][AArch64] Improve streaming mode
 compatibility. (#100181)

* Allow arm-streaming if all the function versions adhere to it.
* Allow arm-streaming-compatible if all the function versions adhere
to it.

When the caller needs to toggle the streaming mode, all the function
versions of the callee must adhere to the same mode; otherwise the call
will yield a runtime error.

Imagine the versions of the callee live in separate TUs. The version
that is visible to the caller determines the calling convention used
when generating code for the callsite, so we cannot support mixing
streaming with non-streaming function versions. For example, suppose
TU1 has a streaming caller that calls foo._sme, which is
streaming-compatible: the codegen for the callsite will not switch off
the streaming mode, yet TU2 may provide a non-streaming version that
could then be called in streaming mode. Similarly, if the caller is
non-streaming and the visible version is streaming-compatible, the
codegen for the callsite will not switch on the streaming mode, but
other versions may be streaming.
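
A minimal sketch of the resulting rule (function names are illustrative;
the exact diagnostic text is not shown here): every version of a
multi-versioned function must share the same streaming interface, while
__arm_locally_streaming is accepted per version:

  __attribute__((target_version("sve")))     void ok(void) __arm_streaming {}
  __attribute__((target_version("default"))) void ok(void) __arm_streaming {}  // OK: all versions streaming

  __attribute__((target_version("sve")))     void bad(void) __arm_streaming {}
  __attribute__((target_version("default"))) void bad(void) {}  // error: streaming and non-streaming versions mixed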
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 -
 clang/lib/Sema/SemaDecl.cpp                   |  24 +++-
 clang/lib/Sema/SemaDeclAttr.cpp               |   7 --
 clang/test/CodeGen/aarch64-fmv-streaming.c    | 107 ++++++++++++++++++
 clang/test/Sema/aarch64-fmv-streaming.c       |  46 ++++++++
 clang/test/Sema/aarch64-sme-func-attrs.c      |  42 -------
 6 files changed, 173 insertions(+), 55 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-fmv-streaming.c
 create mode 100644 clang/test/Sema/aarch64-fmv-streaming.c

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 810abe4f23e31..beee2432fdb06 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3812,8 +3812,6 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning<
   InGroup<AArch64SMEAttributes>, DefaultIgnore;
 def err_conflicting_attributes_arm_state : Error<
   "conflicting attributes for state '%0'">;
-def err_sme_streaming_cannot_be_multiversioned : Error<
-  "streaming function cannot be multi-versioned">;
 def err_unknown_arm_state : Error<
   "unknown state '%0'">;
 def err_missing_arm_state : Error<
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 575bd292f27de..7bdecb2546f30 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11009,6 +11009,9 @@ static bool AttrCompatibleWithMultiVersion(attr::Kind Kind,
   switch (Kind) {
   default:
     return false;
+  case attr::ArmLocallyStreaming:
+    return MVKind == MultiVersionKind::TargetVersion ||
+           MVKind == MultiVersionKind::TargetClones;
   case attr::Used:
     return MVKind == MultiVersionKind::Target;
   case attr::NonNull:
@@ -11145,7 +11148,21 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
     FunctionType::ExtInfo OldTypeInfo = OldType->getExtInfo();
     FunctionType::ExtInfo NewTypeInfo = NewType->getExtInfo();
 
-    if (OldTypeInfo.getCC() != NewTypeInfo.getCC())
+    const auto *OldFPT = OldFD->getType()->getAs<FunctionProtoType>();
+    const auto *NewFPT = NewFD->getType()->getAs<FunctionProtoType>();
+
+    bool ArmStreamingCCMismatched = false;
+    if (OldFPT && NewFPT) {
+      unsigned Diff =
+          OldFPT->getAArch64SMEAttributes() ^ NewFPT->getAArch64SMEAttributes();
+      // Arm-streaming, arm-streaming-compatible and non-streaming versions
+      // cannot be mixed.
+      if (Diff & (FunctionType::SME_PStateSMEnabledMask |
+                  FunctionType::SME_PStateSMCompatibleMask))
+        ArmStreamingCCMismatched = true;
+    }
+
+    if (OldTypeInfo.getCC() != NewTypeInfo.getCC() || ArmStreamingCCMismatched)
       return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << CallingConv;
 
     QualType OldReturnType = OldType->getReturnType();
@@ -11165,9 +11182,8 @@ bool Sema::areMultiversionVariantFunctionsCompatible(
     if (!CLinkageMayDiffer && OldFD->isExternC() != NewFD->isExternC())
       return Diag(DiffDiagIDAt.first, DiffDiagIDAt.second) << LanguageLinkage;
 
-    if (CheckEquivalentExceptionSpec(
-            OldFD->getType()->getAs<FunctionProtoType>(), OldFD->getLocation(),
-            NewFD->getType()->getAs<FunctionProtoType>(), NewFD->getLocation()))
+    if (CheckEquivalentExceptionSpec(OldFPT, OldFD->getLocation(), NewFPT,
+                                     NewFD->getLocation()))
       return true;
   }
   return false;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 300bd8967790f..8b0a6bf4caf48 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3034,9 +3034,6 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
       return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
              << Unsupported << None << CurFeature << TargetVersion;
   }
-  if (IsArmStreamingFunction(cast<FunctionDecl>(D),
-                             /*IncludeLocallyStreaming=*/false))
-    return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
   return false;
 }
 
@@ -3133,10 +3130,6 @@ bool Sema::checkTargetClonesAttrString(
           HasNotDefault = true;
         }
       }
-      if (IsArmStreamingFunction(cast<FunctionDecl>(D),
-                                 /*IncludeLocallyStreaming=*/false))
-        return Diag(LiteralLoc,
-                    diag::err_sme_streaming_cannot_be_multiversioned);
     } else {
       // Other targets ( currently X86 )
       if (Cur.starts_with("arch=")) {
diff --git a/clang/test/CodeGen/aarch64-fmv-streaming.c b/clang/test/CodeGen/aarch64-fmv-streaming.c
new file mode 100644
index 0000000000000..e549ccda59ad8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-streaming.c
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -emit-llvm -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msve
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msimd
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee._Msme2
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+//
+__attribute__((target_version("sme2"))) void n_callee(void) {}
+// CHECK-LABEL: define {{[^@]+}}@n_callee.default
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void n_callee(void) {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msve
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msimd
+// CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee._Msme2
+// CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void s_callee(void) __arm_streaming {}
+// CHECK-LABEL: define {{[^@]+}}@s_callee.default
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void s_callee(void) __arm_streaming {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msve
+// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+//
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msimd
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+//
+__attribute__((target_clones("sve", "simd"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee._Msme2
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+//
+__arm_locally_streaming __attribute__((target_version("sme2"))) void sc_callee(void) __arm_streaming_compatible {}
+// CHECK-LABEL: define {{[^@]+}}@sc_callee.default
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+//
+__attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_compatible {}
+
+
+// CHECK-LABEL: define {{[^@]+}}@n_caller
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12:[0-9]+]]
+// CHECK:    call void @sc_callee() #[[ATTR13:[0-9]+]]
+//
+void n_caller(void) {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@s_caller
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12]]
+// CHECK:    call void @sc_callee() #[[ATTR13]]
+//
+void s_caller(void) __arm_streaming {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK-LABEL: define {{[^@]+}}@sc_caller
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK:    call void @n_callee()
+// CHECK:    call void @s_callee() #[[ATTR12]]
+// CHECK:    call void @sc_callee() #[[ATTR13]]
+//
+void sc_caller(void) __arm_streaming_compatible {
+  n_callee();
+  s_callee();
+  sc_callee();
+}
+
+
+// CHECK: attributes #[[ATTR0:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR1:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body"
+// CHECK: attributes #[[ATTR2:[0-9]+]] = {{.*}}
+// CHECK: attributes #[[ATTR3]] = {{.*}}
+// CHECK: attributes #[[ATTR4:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR5:[0-9]+]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR6:[0-9]+]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR7]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR8:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR9:[0-9]+]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR10]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR11]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[ATTR12]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[ATTR13]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/Sema/aarch64-fmv-streaming.c b/clang/test/Sema/aarch64-fmv-streaming.c
new file mode 100644
index 0000000000000..93b7656216c0c
--- /dev/null
+++ b/clang/test/Sema/aarch64-fmv-streaming.c
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming(void) __arm_streaming {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming(void) __arm_streaming {}
+__attribute__((target_version("default"))) void ok_arm_streaming(void) __arm_streaming {}
+
+__attribute__((target_clones("sve", "simd"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__arm_locally_streaming __attribute__((target_version("sme2"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+__attribute__((target_version("default"))) void ok_arm_streaming_compatible(void) __arm_streaming_compatible {}
+
+__arm_locally_streaming __attribute__((target_clones("sve", "simd"))) void ok_no_streaming(void) {}
+__attribute__((target_version("sme2"))) void ok_no_streaming(void) {}
+__attribute__((target_version("default"))) void ok_no_streaming(void) {}
+
+__attribute__((target_clones("sve", "simd"))) void bad_mixed_streaming(void) {}
+// expected-cpp-error at +2 {{multiversioned function declaration has a different calling convention}}
+// expected-error at +1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("sme2"))) void bad_mixed_streaming(void) __arm_streaming {}
+// expected-cpp-error at +2 {{multiversioned function declaration has a different calling convention}}
+// expected-error at +1 {{multiversioned function declaration has a different calling convention}}
+__attribute__((target_version("default"))) void bad_mixed_streaming(void) __arm_streaming_compatible {}
+// expected-cpp-error at +2 {{multiversioned function declaration has a different calling convention}}
+// expected-error at +1 {{multiversioned function declaration has a different calling convention}}
+__arm_locally_streaming __attribute__((target_version("dotprod"))) void bad_mixed_streaming(void) __arm_streaming {}
+
+void n_caller(void) {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
+
+void s_caller(void) __arm_streaming {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
+
+void sc_caller(void) __arm_streaming_compatible {
+  ok_arm_streaming();
+  ok_arm_streaming_compatible();
+  ok_no_streaming();
+  bad_mixed_streaming();
+}
diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 6db39d6a71e36..0c263eb2610cf 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -455,48 +455,6 @@ void unimplemented_spill_fill_za(void (*share_zt0_only)(void) __arm_inout("zt0")
   share_zt0_only();
 }
 
-// expected-cpp-error at +2 {{streaming function cannot be multi-versioned}}
-// expected-error at +1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_version("sme2")))
-void cannot_work_version(void) __arm_streaming {}
-// expected-cpp-error at +5 {{function declared 'void ()' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
-// expected-cpp-note at -2 {{previous declaration is here}}
-// expected-error at +3 {{function declared 'void (void)' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
-// expected-note at -4 {{previous declaration is here}}
-__attribute__((target_version("default")))
-void cannot_work_version(void) {}
-
-
-// expected-cpp-error at +2 {{streaming function cannot be multi-versioned}}
-// expected-error at +1 {{streaming function cannot be multi-versioned}}
-__attribute__((target_clones("sme2")))
-void cannot_work_clones(void) __arm_streaming {}
-
-
-__attribute__((target("sme2")))
-void just_fine_streaming(void) __arm_streaming {}
-__attribute__((target_version("sme2")))
-void just_fine(void) { just_fine_streaming(); }
-__attribute__((target_version("default")))
-void just_fine(void) {}
-
-
-__arm_locally_streaming
-__attribute__((target_version("sme2")))
-void incompatible_locally_streaming(void) {}
-// expected-error at -1 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-// expected-cpp-error at -2 {{attribute 'target_version' multiversioning cannot be combined with attribute '__arm_locally_streaming'}}
-__attribute__((target_version("default")))
-void incompatible_locally_streaming(void) {}
-
-
-void fmv_caller() {
-    cannot_work_version();
-    cannot_work_clones();
-    just_fine();
-    incompatible_locally_streaming();
-}
-
 void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { }
 
 __SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; }

>From 8d937c60a761def562bcf4cccbaea58176ee47da Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr at gmail.com>
Date: Fri, 26 Jul 2024 10:22:13 +0200
Subject: [PATCH 246/259] [bindings][ocaml] Remove a reference to the deleted
 function `llvm_x86_mmx_type`

The function was removed in dfeb3991fb489a703f631ab0c34b58f80568038d.

This change fixes the build for OCaml bindings and tests.
---
 llvm/bindings/ocaml/llvm/llvm.ml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml
index 86b010e0ac22d..908e6658a89f7 100644
--- a/llvm/bindings/ocaml/llvm/llvm.ml
+++ b/llvm/bindings/ocaml/llvm/llvm.ml
@@ -519,7 +519,6 @@ external vector_size : lltype -> int = "llvm_vector_size"
 (*--... Operations on other types ..........................................--*)
 external void_type : llcontext -> lltype = "llvm_void_type"
 external label_type : llcontext -> lltype = "llvm_label_type"
-external x86_mmx_type : llcontext -> lltype = "llvm_x86_mmx_type"
 external type_by_name : llmodule -> string -> lltype option = "llvm_type_by_name"
 
 external classify_value : llvalue -> ValueKind.t = "llvm_classify_value"

>From 3834523f77c0709ef873a3b1fb878d096687987a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 26 Jul 2024 16:50:21 +0800
Subject: [PATCH 247/259] [LV][EVL] Refine the constructors of EVL recipe to
 use call by reference. NFC (#100088)

---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 27 +++++++++----------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  6 ++---
 .../Transforms/Vectorize/VPlanTest.cpp        |  4 +--
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d18d5d6ff79..c9da5e5d38a6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2246,12 +2246,12 @@ class VPReductionRecipe : public VPSingleDefRecipe {
 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
 class VPReductionEVLRecipe : public VPReductionRecipe {
 public:
-  VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+  VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
       : VPReductionRecipe(
-            VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
-            cast_or_null<Instruction>(R->getUnderlyingValue()),
-            ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
-            R->isOrdered()) {}
+            VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+            cast_or_null<Instruction>(R.getUnderlyingValue()),
+            ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
+            R.isOrdered()) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -2558,10 +2558,10 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
-  VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
-                            {L->getAddr(), EVL}, L->isConsecutive(),
-                            L->isReverse(), L->getDebugLoc()),
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
+                            {L.getAddr(), &EVL}, L.isConsecutive(),
+                            L.isReverse(), L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2634,11 +2634,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 /// using the value to store, the address to store to, the explicit vector
 /// length and an optional mask.
 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
-  VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
-                            {S->getAddr(), S->getStoredValue(), EVL},
-                            S->isConsecutive(), S->isReverse(),
-                            S->getDebugLoc()) {
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+                            {S.getAddr(), S.getStoredValue(), &EVL},
+                            S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
     setMask(Mask);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..045f6c356669f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1488,13 +1488,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
       if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
         VPValue *NewMask = GetNewMask(MemR->getMask());
         if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
-          NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+          NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask);
         else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
-          NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+          NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
         else
           llvm_unreachable("unsupported recipe");
       } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
-        NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
+        NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
                                              GetNewMask(RedR->getCondOp()));
       }
 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index d9d6789134d88..9cf9060458bc9 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1140,7 +1140,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
                              &VecOp, false);
     VPValue EVL;
-    VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+    VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
     EXPECT_FALSE(EVLRecipe.mayHaveSideEffects());
     EXPECT_FALSE(EVLRecipe.mayReadFromMemory());
     EXPECT_FALSE(EVLRecipe.mayWriteToMemory());
@@ -1495,7 +1495,7 @@ TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
   VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, &ChainOp, &CondOp,
                            &VecOp, false);
   VPValue EVL;
-  VPReductionEVLRecipe EVLRecipe(&Recipe, &EVL, &CondOp);
+  VPReductionEVLRecipe EVLRecipe(Recipe, EVL, &CondOp);
   EXPECT_TRUE(isa<VPUser>(&EVLRecipe));
   VPRecipeBase *BaseR = &EVLRecipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));

>From 6f83a031e452bb33c0ee23b8c5c4dee97ce2bf52 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 13:02:12 +0400
Subject: [PATCH 248/259] CodeGen: Move current call site out of
 MachineModuleInfo (#100369)

I do not fully understand what this is for, but it's only used in
SelectionDAGBuilder, so move it to FunctionLoweringInfo like other
function scope DAG builder state. The intrinsics are not documented
in the LangRef or Intrinsics.td.

This removes the last piece of codegen state from MachineModuleInfo.
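
As a rough sketch of how the call-site index is tracked after this
change (the helper names below are made up for illustration; the
accessors are the ones added to FunctionLoweringInfo in this patch):

  #include "llvm/CodeGen/FunctionLoweringInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/MC/MCSymbol.h"
  #include <cassert>

  using namespace llvm;

  // Record the index from @llvm.eh.sjlj.callsite as per-function state.
  static void recordSjLjCallSite(FunctionLoweringInfo &FuncInfo,
                                 unsigned SiteIdx) {
    assert(FuncInfo.getCurrentCallSite() == 0 && "Overlapping call sites!");
    FuncInfo.setCurrentCallSite(SiteIdx);
  }

  // At the start of an invoke's EH range, bind the pending call site to
  // the begin label and stop tracking it.
  static void bindSjLjCallSite(FunctionLoweringInfo &FuncInfo,
                               MachineFunction &MF, MCSymbol *BeginLabel) {
    if (unsigned CallSiteIndex = FuncInfo.getCurrentCallSite()) {
      MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
      FuncInfo.setCurrentCallSite(0);
    }
  }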
---
 .../llvm/CodeGen/FunctionLoweringInfo.h       |  9 +++++++
 llvm/include/llvm/CodeGen/MachineModuleInfo.h | 24 -------------------
 llvm/lib/CodeGen/MachineFunction.cpp          |  5 ++++
 llvm/lib/CodeGen/MachineModuleInfo.cpp        |  2 --
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 10 ++++----
 5 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 45a47d7333e35..04667c04a4ef4 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -188,6 +188,9 @@ class FunctionLoweringInfo {
   /// SelectionDAGISel::PrepareEHLandingPad().
   unsigned ExceptionPointerVirtReg, ExceptionSelectorVirtReg;
 
+  /// The current call site index being processed, if any. 0 if none.
+  unsigned CurCallSite = 0;
+
   /// Collection of dbg.declare instructions handled after argument
   /// lowering and before ISel proper.
   SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
@@ -281,6 +284,12 @@ class FunctionLoweringInfo {
   Register getCatchPadExceptionPointerVReg(const Value *CPI,
                                            const TargetRegisterClass *RC);
 
+  /// Set the call site currently being processed.
+  void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
+
+  /// Get the call site currently being processed, if any. Return zero if none.
+  unsigned getCurrentCallSite() { return CurCallSite; }
+
 private:
   /// LiveOutRegInfo - Information about live out vregs.
   IndexedMap<LiveOutInfo, VirtReg2IndexFunctor> LiveOutRegInfo;
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index dfa0e993ec06a..f054c56bb641c 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -99,20 +99,6 @@ class MachineModuleInfo {
   /// want.
   MachineModuleInfoImpl *ObjFileMMI;
 
-  /// \name Exception Handling
-  /// \{
-
-  /// The current call site index being processed, if any. 0 if none.
-  unsigned CurCallSite = 0;
-
-  /// \}
-
-  // TODO: Ideally, what we'd like is to have a switch that allows emitting
-  // synchronous (precise at call-sites only) CFA into .eh_frame. However,
-  // even under this switch, we'd like .debug_frame to be precise when using
-  // -g. At this moment, there's no way to specify that some CFI directives
-  // go into .eh_frame only, while others go into .debug_frame only.
-
   /// True if debugging information is available in this module.
   bool DbgInfoAvailable = false;
 
@@ -185,16 +171,6 @@ class MachineModuleInfo {
   /// Returns true if valid debug info is present.
   bool hasDebugInfo() const { return DbgInfoAvailable; }
 
-  /// \name Exception Handling
-  /// \{
-
-  /// Set the call site currently being processed.
-  void setCurrentCallSite(unsigned Site) { CurCallSite = Site; }
-
-  /// Get the call site currently being processed, if any.  return zero if
-  /// none.
-  unsigned getCurrentCallSite() { return CurCallSite; }
-
   /// \}
 }; // End class MachineModuleInfo
 
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 7f6a75208d253..61799251095ae 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -654,6 +654,11 @@ void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
 
 /// True if this function needs frame moves for debug or exceptions.
 bool MachineFunction::needsFrameMoves() const {
+  // TODO: Ideally, what we'd like is to have a switch that allows emitting
+  // synchronous (precise at call-sites only) CFA into .eh_frame. However, even
+  // under this switch, we'd like .debug_frame to be precise when using -g. At
+  // this moment, there's no way to specify that some CFI directives go into
+  // .eh_frame only, while others go into .debug_frame only.
   return getMMI().hasDebugInfo() ||
          getTarget().Options.ForceDwarfFrameSection ||
          F.needsUnwindTableEntry();
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 150ab363c8fcd..f382df1d2a6e0 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -26,7 +26,6 @@ MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
 
 void MachineModuleInfo::initialize() {
   ObjFileMMI = nullptr;
-  CurCallSite = 0;
   NextFnNum = 0;
   DbgInfoAvailable = false;
 }
@@ -46,7 +45,6 @@ MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
       MachineFunctions(std::move(MMI.MachineFunctions)) {
   Context.setObjectFileInfo(TM.getObjFileLowering());
   ObjFileMMI = MMI.ObjFileMMI;
-  CurCallSite = MMI.CurCallSite;
   ExternalContext = MMI.ExternalContext;
   TheModule = MMI.TheModule;
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1791f1b503379..c554c0f5b6fd7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6708,10 +6708,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   case Intrinsic::eh_sjlj_callsite: {
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
-    assert(DAG.getMMI()->getCurrentCallSite() == 0 &&
-           "Overlapping call sites!");
+    assert(FuncInfo.getCurrentCallSite() == 0 && "Overlapping call sites!");
 
-    DAG.getMMI()->setCurrentCallSite(CI->getZExtValue());
+    FuncInfo.setCurrentCallSite(CI->getZExtValue());
     return;
   }
   case Intrinsic::eh_sjlj_functioncontext: {
@@ -8619,7 +8618,6 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
                                           const BasicBlock *EHPadBB,
                                           MCSymbol *&BeginLabel) {
   MachineFunction &MF = DAG.getMachineFunction();
-  MachineModuleInfo &MMI = MF.getMMI();
 
   // Insert a label before the invoke call to mark the try range.  This can be
   // used to detect deletion of the invoke via the MachineModuleInfo.
@@ -8627,13 +8625,13 @@ SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
 
   // For SjLj, keep track of which landing pads go with which invokes
   // so as to maintain the ordering of pads in the LSDA.
-  unsigned CallSiteIndex = MMI.getCurrentCallSite();
+  unsigned CallSiteIndex = FuncInfo.getCurrentCallSite();
   if (CallSiteIndex) {
     MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
     LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
 
     // Now that the call site is handled, stop tracking it.
-    MMI.setCurrentCallSite(0);
+    FuncInfo.setCurrentCallSite(0);
   }
 
   return DAG.getEHLabel(getCurSDLoc(), Chain, BeginLabel);

>From 4ce3993ee2b6ee883ef62100df68db9e10ef1dc9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 13:06:27 +0400
Subject: [PATCH 249/259] DebugInfo: Avoid some MMI::hasDebugInfo checks
 (#100333)

I assume getSubprogram() will do the correct thing in hasDebugInfo(),
and the MMI check is redundant with the debug_compile_units distance
check.
This is in preparation for removing the field.
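
A minimal sketch of the reasoning (illustration only, not the exact
patch code): a function-level debug-info query can rely on the attached
DISubprogram, so the module-level MMI bit adds nothing on top of it and
the compile-unit count:

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/IR/DebugInfoMetadata.h"
  #include "llvm/IR/Function.h"

  // If the function carries a DISubprogram, debug info is present for it;
  // the debug_compile_units distance check covers the module-level case.
  static bool functionHasDebugInfo(const llvm::MachineFunction &MF) {
    return MF.getFunction().getSubprogram() != nullptr;
  }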
---
 llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 9 +++------
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp       | 7 ++++---
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp        | 4 ++--
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 6c70c47de8822..ed99eb3c459e5 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -241,10 +241,7 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
          Ty->getTag() == dwarf::DW_TAG_unspecified_type;
 }
 
-static bool hasDebugInfo(const MachineModuleInfo *MMI,
-                         const MachineFunction *MF) {
-  if (!MMI->hasDebugInfo())
-    return false;
+static bool hasDebugInfo(const MachineFunction *MF) {
   auto *SP = MF->getFunction().getSubprogram();
   if (!SP)
     return false;
@@ -258,7 +255,7 @@ static bool hasDebugInfo(const MachineModuleInfo *MMI,
 void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
   PrevInstBB = nullptr;
 
-  if (!Asm || !hasDebugInfo(MMI, MF)) {
+  if (!Asm || !hasDebugInfo(MF)) {
     skippedNonDebugFunction();
     return;
   }
@@ -415,7 +412,7 @@ void DebugHandlerBase::endInstruction() {
 }
 
 void DebugHandlerBase::endFunction(const MachineFunction *MF) {
-  if (Asm && hasDebugInfo(MMI, MF))
+  if (Asm && hasDebugInfo(MF))
     endFunctionImpl(MF);
   DbgValues.clear();
   DbgLabels.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5f1f315c5ab24..fbce7e92b7781 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1148,14 +1148,15 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
 void DwarfDebug::beginModule(Module *M) {
   DebugHandlerBase::beginModule(M);
 
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm)
     return;
 
   unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
                                        M->debug_compile_units_end());
+  if (NumDebugCUs == 0)
+    return;
+
   assert(NumDebugCUs > 0 && "Asm unexpectedly initialized");
-  assert(MMI->hasDebugInfo() &&
-         "DebugInfoAvailabilty unexpectedly not initialized");
   SingleCU = NumDebugCUs == 1;
   DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
       GVMap;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0b654abd2814c..b4eba07afe7c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -493,7 +493,7 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
   // Emit initial .loc debug directive for correct relocation symbol data.
   if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
     assert(SP->getUnit());
-    if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+    if (!SP->getUnit()->isDebugDirectivesOnly())
       emitInitialRawDwarfLocDirective(*MF);
   }
 }
@@ -912,7 +912,7 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
     if (HasFullDebugInfo)
       break;
   }
-  if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
+  if (HasFullDebugInfo)
     O << ", debug";
 
   O << "\n";

>From 63e1647827f3427c5f3ad37461d84a63ba5fcdaf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 26 Jul 2024 13:10:08 +0400
Subject: [PATCH 250/259] CodeGen: Remove MachineModuleInfo reference from
 MachineFunction (#100357)

This avoids another unserializable field. Move the DbgInfoAvailable
field into the AsmPrinter, where it is really just a cached convenience
bit for a direct check of the IR module's debug metadata.
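
A minimal sketch of the constructor change as it shows up at call sites
(it mirrors the unit-test updates below; the wrapper function is made
up for illustration):

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/CodeGen/TargetSubtargetInfo.h"
  #include "llvm/IR/Function.h"
  #include "llvm/Target/TargetMachine.h"
  #include <memory>

  // Old: MachineFunction(F, TM, STI, FunctionNum, MMI)
  // New: pass the MCContext owned by the MMI instead of the MMI itself.
  std::unique_ptr<llvm::MachineFunction>
  makeMF(llvm::Function &F, const llvm::LLVMTargetMachine &TM,
         const llvm::TargetSubtargetInfo &STI, llvm::MachineModuleInfo &MMI,
         unsigned FunctionNum) {
    return std::make_unique<llvm::MachineFunction>(
        F, TM, STI, MMI.getContext(), FunctionNum);
  }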
---
 llvm/include/llvm/CodeGen/AsmPrinter.h        |  6 ++++++
 llvm/include/llvm/CodeGen/MIRPrinter.h        |  1 +
 llvm/include/llvm/CodeGen/MachineFunction.h   | 19 ++++++++-----------
 llvm/include/llvm/CodeGen/MachineModuleInfo.h |  6 ------
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  1 +
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  1 +
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    | 18 ++++++++----------
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp |  4 ++--
 .../CodeGen/AsmPrinter/DebugHandlerBase.cpp   |  4 ++--
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |  2 +-
 llvm/lib/CodeGen/MachineFunction.cpp          | 12 ++++++------
 llvm/lib/CodeGen/MachineFunctionAnalysis.cpp  |  2 +-
 llvm/lib/CodeGen/MachineModuleInfo.cpp        |  5 +----
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     |  4 +---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 12 ++++++------
 llvm/tools/llvm-reduce/ReducerWorkItem.cpp    |  2 +-
 .../CodeGen/AArch64SelectionDAGTest.cpp       |  4 ++--
 llvm/unittests/CodeGen/InstrRefLDVTest.cpp    |  2 +-
 llvm/unittests/CodeGen/MFCommon.inc           |  3 ++-
 .../SelectionDAGAddressAnalysisTest.cpp       |  2 +-
 .../CodeGen/SelectionDAGPatternMatchTest.cpp  |  2 +-
 .../AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp  |  3 ++-
 llvm/unittests/Target/AMDGPU/PALMetadata.cpp  |  2 +-
 .../Target/RISCV/RISCVInstrInfoTest.cpp       |  2 +-
 24 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index f57be39076a78..36d1b47973870 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -225,6 +225,9 @@ class AsmPrinter : public MachineFunctionPass {
   /// split stack prologue.
   bool HasNoSplitStack = false;
 
+  /// True if debugging information is available in this module.
+  bool DbgInfoAvailable = false;
+
 protected:
   explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
 
@@ -430,6 +433,9 @@ class AsmPrinter : public MachineFunctionPass {
   /// Get the CFISection type for the module.
   CFISection getModuleCFISectionType() const { return ModuleCFISection; }
 
+  /// Returns true if valid debug info is present.
+  bool hasDebugInfo() const { return DbgInfoAvailable; }
+
   bool needsSEHMoves();
 
   /// Since emitting CFI unwind information is entangled with supporting the
diff --git a/llvm/include/llvm/CodeGen/MIRPrinter.h b/llvm/include/llvm/CodeGen/MIRPrinter.h
index e98ef72243b8d..85bd674c56a60 100644
--- a/llvm/include/llvm/CodeGen/MIRPrinter.h
+++ b/llvm/include/llvm/CodeGen/MIRPrinter.h
@@ -21,6 +21,7 @@ namespace llvm {
 
 class MachineBasicBlock;
 class MachineFunction;
+class MachineModuleInfo;
 class Module;
 template <typename T> class SmallVectorImpl;
 
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6e7292abeddbb..e1d03fe6b92c1 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -59,7 +59,6 @@ class MachineConstantPool;
 class MachineFrameInfo;
 class MachineFunction;
 class MachineJumpTableInfo;
-class MachineModuleInfo;
 class MachineRegisterInfo;
 class MCContext;
 class MCInstrDesc;
@@ -260,7 +259,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   const LLVMTargetMachine &Target;
   const TargetSubtargetInfo *STI;
   MCContext &Ctx;
-  MachineModuleInfo &MMI;
 
   // RegInfo - Information about each register in use in the function.
   MachineRegisterInfo *RegInfo;
@@ -395,15 +393,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
 
   /// \}
 
-  /// Clear all the members of this MachineFunction, but the ones used
-  /// to initialize again the MachineFunction.
-  /// More specifically, this deallocates all the dynamically allocated
-  /// objects and get rid of all the XXXInfo data structure, but keep
-  /// unchanged the references to Fn, Target, MMI, and FunctionNumber.
+  /// Clear all the members of this MachineFunction, but the ones used to
+  /// initialize again the MachineFunction.  More specifically, this deallocates
+  /// all the dynamically allocated objects and get rids of all the XXXInfo data
+  /// structure, but keeps unchanged the references to Fn, Target, and
+  /// FunctionNumber.
   void clear();
   /// Allocate and initialize the different members.
   /// In particular, the XXXInfo data structure.
-  /// \pre Fn, Target, MMI, and FunctionNumber are properly set.
+  /// \pre Fn, Target, and FunctionNumber are properly set.
   void init();
 
 public:
@@ -632,8 +630,8 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   const static unsigned int DebugOperandMemNumber;
 
   MachineFunction(Function &F, const LLVMTargetMachine &Target,
-                  const TargetSubtargetInfo &STI, unsigned FunctionNum,
-                  MachineModuleInfo &MMI);
+                  const TargetSubtargetInfo &STI, MCContext &Ctx,
+                  unsigned FunctionNum);
   MachineFunction(const MachineFunction &) = delete;
   MachineFunction &operator=(const MachineFunction &) = delete;
   ~MachineFunction();
@@ -665,7 +663,6 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
 
   GISelChangeObserver *getObserver() const { return Observer; }
 
-  MachineModuleInfo &getMMI() const { return MMI; }
   MCContext &getContext() const { return Ctx; }
 
   /// Returns the Section this function belongs to.
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index f054c56bb641c..310cc4b2abb77 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -99,9 +99,6 @@ class MachineModuleInfo {
   /// want.
   MachineModuleInfoImpl *ObjFileMMI;
 
-  /// True if debugging information is available in this module.
-  bool DbgInfoAvailable = false;
-
   /// Maps IR Functions to their corresponding MachineFunctions.
   DenseMap<const Function*, std::unique_ptr<MachineFunction>> MachineFunctions;
   /// Next unique number available for a MachineFunction.
@@ -168,9 +165,6 @@ class MachineModuleInfo {
     return const_cast<MachineModuleInfo*>(this)->getObjFileInfo<Ty>();
   }
 
-  /// Returns true if valid debug info is present.
-  bool hasDebugInfo() const { return DbgInfoAvailable; }
-
   /// \}
 }; // End class MachineModuleInfo
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 24eab7b408675..6a80c8c7216f6 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -76,6 +76,7 @@ struct KnownBits;
 class LLVMContext;
 class MachineBasicBlock;
 class MachineConstantPoolValue;
+class MachineModuleInfo;
 class MCSymbol;
 class OptimizationRemarkEmitter;
 class ProfileSummaryInfo;
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 54f71c9d221ad..882cadea22369 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -46,6 +46,7 @@ class LiveIntervals;
 class LiveVariables;
 class MachineLoop;
 class MachineMemOperand;
+class MachineModuleInfo;
 class MachineRegisterInfo;
 class MCAsmInfo;
 class MCInst;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 2297b27ffdc07..4e35664e7458d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -436,6 +436,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
   HasSplitStack = false;
   HasNoSplitStack = false;
+  DbgInfoAvailable = !M.debug_compile_units().empty();
 
   AddrLabelSymbols = nullptr;
 
@@ -541,8 +542,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     if (EmitCodeView && TM.getTargetTriple().isOSWindows())
       DebugHandlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
-      assert(MMI && "MMI could not be nullptr here!");
-      if (MMI->hasDebugInfo()) {
+      if (hasDebugInfo()) {
         DD = new DwarfDebug(this);
         DebugHandlers.push_back(std::unique_ptr<DwarfDebug>(DD));
       }
@@ -1277,8 +1277,7 @@ AsmPrinter::getFunctionCFISectionType(const Function &F) const {
   if (MAI->usesCFIWithoutEH() && F.hasUWTable())
     return CFISection::EH;
 
-  assert(MMI != nullptr && "Invalid machine module info");
-  if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
+  if (hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
     return CFISection::Debug;
 
   return CFISection::None;
@@ -1669,10 +1668,9 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
 }
 
 /// Returns true if function begin and end labels should be emitted.
-static bool needFuncLabels(const MachineFunction &MF,
-                           const MachineModuleInfo &MMI) {
-  if (!MF.getLandingPads().empty() || MF.hasEHFunclets() ||
-      MMI.hasDebugInfo() ||
+static bool needFuncLabels(const MachineFunction &MF, const AsmPrinter &Asm) {
+  if (Asm.hasDebugInfo() || !MF.getLandingPads().empty() ||
+      MF.hasEHFunclets() ||
       MF.getFunction().hasMetadata(LLVMContext::MD_pcsections))
     return true;
 
@@ -1944,7 +1942,7 @@ void AsmPrinter::emitFunctionBody() {
   // are automatically sized.
   bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm();
 
-  if (needFuncLabels(*MF, *MMI) || EmitFunctionSize) {
+  if (EmitFunctionSize || needFuncLabels(*MF, *this)) {
     // Create a symbol for the end of function.
     CurrentFnEnd = createTempSymbol("func_end");
     OutStreamer->emitLabel(CurrentFnEnd);
@@ -2588,7 +2586,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   if (F.hasFnAttribute("patchable-function-entry") ||
       F.hasFnAttribute("function-instrument") ||
       F.hasFnAttribute("xray-instruction-threshold") ||
-      needFuncLabels(MF, *MMI) || NeedsLocalForSize ||
+      needFuncLabels(MF, *this) || NeedsLocalForSize ||
       MF.getTarget().Options.EmitStackSizeSection ||
       MF.getTarget().Options.BBAddrMap || MF.hasBBLabels()) {
     CurrentFnBegin = createTempSymbol("func_begin");
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index dddc08b3bc016..7700ffd6da803 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -613,7 +613,7 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
 void CodeViewDebug::beginModule(Module *M) {
   // If module doesn't have named metadata anchors or COFF debug section
   // is not available, skip any debug info related stuff.
-  if (!MMI->hasDebugInfo() ||
+  if (!Asm->hasDebugInfo() ||
       !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
     return;
@@ -636,7 +636,7 @@ void CodeViewDebug::beginModule(Module *M) {
 }
 
 void CodeViewDebug::endModule() {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   // The COFF .debug$S section consists of several subsections, each starting
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index ed99eb3c459e5..de2263c57493b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -351,7 +351,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
 }
 
 void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   assert(CurMI == nullptr);
@@ -377,7 +377,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
 }
 
 void DebugHandlerBase::endInstruction() {
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   assert(CurMI != nullptr);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index fbce7e92b7781..ac4d0f21de60b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1434,7 +1434,7 @@ void DwarfDebug::endModule() {
 
   // If we aren't actually generating debug info (check beginModule -
   // conditionalized on the presence of the llvm.dbg.cu metadata node)
-  if (!Asm || !MMI->hasDebugInfo())
+  if (!Asm || !Asm->hasDebugInfo())
     return;
 
   // Finalize the debug info for the module.
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 61799251095ae..40bde200f37ac 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -162,9 +162,9 @@ static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
 }
 
 MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
-                                 const TargetSubtargetInfo &STI,
-                                 unsigned FunctionNum, MachineModuleInfo &mmi)
-    : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
+                                 const TargetSubtargetInfo &STI, MCContext &Ctx,
+                                 unsigned FunctionNum)
+    : F(F), Target(Target), STI(&STI), Ctx(Ctx) {
   FunctionNumber = FunctionNum;
   init();
 }
@@ -659,9 +659,9 @@ bool MachineFunction::needsFrameMoves() const {
   // under this switch, we'd like .debug_frame to be precise when using -g. At
   // this moment, there's no way to specify that some CFI directives go into
   // .eh_frame only, while others go into .debug_frame only.
-  return getMMI().hasDebugInfo() ||
-         getTarget().Options.ForceDwarfFrameSection ||
-         F.needsUnwindTableEntry();
+  return getTarget().Options.ForceDwarfFrameSection ||
+         F.needsUnwindTableEntry() ||
+         !F.getParent()->debug_compile_units().empty();
 }
 
 namespace llvm {
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 24eb360723dad..e7a4d6d61e211 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -37,7 +37,7 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
                   .getCachedResult<MachineModuleAnalysis>(*F.getParent())
                   ->getMMI();
   auto MF = std::make_unique<MachineFunction>(
-      F, *TM, STI, Context.generateMachineFunctionNum(F), MMI);
+      F, *TM, STI, MMI.getContext(), Context.generateMachineFunctionNum(F));
   MF->initTargetMachineFunctionInfo(STI);
 
   // MRI callback for target specific initializations.
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index f382df1d2a6e0..c66495969b4e6 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -27,7 +27,6 @@ MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
 void MachineModuleInfo::initialize() {
   ObjFileMMI = nullptr;
   NextFnNum = 0;
-  DbgInfoAvailable = false;
 }
 
 void MachineModuleInfo::finalize() {
@@ -87,7 +86,7 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
   if (I.second) {
     // No pre-existing machine function, create a new one.
     const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
-    MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+    MF = new MachineFunction(F, TM, STI, getContext(), NextFnNum++);
     MF->initTargetMachineFunctionInfo(STI);
 
     // MRI callback for target specific initializations.
@@ -207,7 +206,6 @@ bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
   return false;
 }
 
@@ -232,6 +230,5 @@ MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) {
         Ctx.diagnose(
             DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
       });
-  MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
   return Result(MMI);
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b4eba07afe7c5..d7197a7923eaf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -928,8 +928,6 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
 }
 
 bool NVPTXAsmPrinter::doFinalization(Module &M) {
-  bool HasDebugInfo = MMI && MMI->hasDebugInfo();
-
   // If we did not emit any functions, then the global declarations have not
   // yet been emitted.
   if (!GlobalsEmitted) {
@@ -945,7 +943,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
   auto *TS =
       static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer());
   // Close the last emitted section
-  if (HasDebugInfo) {
+  if (hasDebugInfo()) {
     TS->closeLastSection();
     // Emit empty .debug_loc section for better support of the empty files.
     OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}");
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0e2811f87c817..dcde86388dcd9 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -3137,11 +3137,11 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
       break;
     MCSymbol *TempSym = OutContext.createNamedTempSymbol();
     OutStreamer->emitLabel(TempSym);
-    OutStreamer->emitXCOFFExceptDirective(CurrentFnSym, TempSym,
-                 LangMO.getImm(), ReasonMO.getImm(),
-                 Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8 :
-                 MI->getMF()->getInstructionCount() * 4,
-		 MMI->hasDebugInfo());
+    OutStreamer->emitXCOFFExceptDirective(
+        CurrentFnSym, TempSym, LangMO.getImm(), ReasonMO.getImm(),
+        Subtarget->isPPC64() ? MI->getMF()->getInstructionCount() * 8
+                             : MI->getMF()->getInstructionCount() * 4,
+        hasDebugInfo());
     break;
   }
   case PPC::GETtlsMOD32AIX:
@@ -3199,7 +3199,7 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
 bool PPCAIXAsmPrinter::doFinalization(Module &M) {
   // Do streamer related finalization for DWARF.
-  if (!MAI->usesDwarfFileAndLocDirectives() && MMI->hasDebugInfo())
+  if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo())
     OutStreamer->doFinalizationAtSectionEnd(
         OutStreamer->getContext().getObjectFileInfo()->getTextSection());
 
diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
index 629996d74194e..3d36d80f82315 100644
--- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -232,7 +232,7 @@ static std::unique_ptr<MachineFunction> cloneMF(MachineFunction *SrcMF,
                                                 MachineModuleInfo &DestMMI) {
   auto DstMF = std::make_unique<MachineFunction>(
       SrcMF->getFunction(), SrcMF->getTarget(), SrcMF->getSubtarget(),
-      SrcMF->getFunctionNumber(), DestMMI);
+      SrcMF->getContext(), SrcMF->getFunctionNumber());
   DenseMap<MachineBasicBlock *, MachineBasicBlock *> Src2DstMBB;
 
   auto *SrcMRI = &SrcMF->getRegInfo();
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 877793a28b769..79e27c7ced61f 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -61,8 +61,8 @@ class AArch64SelectionDAGTest : public testing::Test {
 
     MachineModuleInfo MMI(TM.get());
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F), 0,
-                                      MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 28cfb3046bd47..d5365d9c79492 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -94,7 +94,7 @@ class InstrRefLDVTest : public testing::Test {
     const TargetSubtargetInfo &STI = *Machine->getSubtargetImpl(*F);
 
     MF = std::make_unique<MachineFunction>(*F, (LLVMTargetMachine &)*Machine,
-                                           STI, FunctionNum, *MMI);
+                                           STI, MMI->getContext(), FunctionNum);
 
     // Create metadata: CU, subprogram, some blocks and an inline function
     // scope.
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 1997e80522975..d464a16f636cf 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -135,6 +135,7 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
   MachineModuleInfo MMI(TM);
   const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(*F);
 
-  return std::make_unique<MachineFunction>(*F, *TM, STI, FunctionNum, MMI);
+  return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
+                                           FunctionNum);
 }
 
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index ef30bd491d352..c89e5a45ab014 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -72,7 +72,7 @@ class SelectionDAGAddressAnalysisTest : public testing::Test {
     MachineModuleInfo MMI(TM.get());
 
     MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
-                                           0, MMI);
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index e318f467bbf27..eadb0e26ba3b2 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -71,7 +71,7 @@ class SelectionDAGPatternMatchTest : public testing::Test {
     MachineModuleInfo MMI(TM.get());
 
     MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
-                                           0, MMI);
+                                           MMI.getContext(), 0);
 
     DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
     if (!DAG)
diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index ece288aaf06a0..5ac4edae5f0df 100644
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -29,7 +29,8 @@ TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) {
   auto *F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &Mod);
 
   MachineModuleInfo MMI(TM.get());
-  auto MF = std::make_unique<MachineFunction>(*F, *TM, ST, 42, MMI);
+  auto MF =
+      std::make_unique<MachineFunction>(*F, *TM, ST, MMI.getContext(), 42);
   auto *BB = MF->CreateMachineBasicBlock();
   MF->push_back(BB);
 
diff --git a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
index 7d4f3836088ab..86aa4753a2123 100644
--- a/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
+++ b/llvm/unittests/Target/AMDGPU/PALMetadata.cpp
@@ -62,7 +62,7 @@ class PALMetadata : public testing::Test {
                                         TM->getTargetCPU(),
                                         TM->getTargetFeatureString(), *TM);
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 1);
   }
 };
 
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 62cfda435a047..fe711619c6320 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -63,7 +63,7 @@ class RISCVInstrInfoTest : public testing::TestWithParam<const char *> {
         TM->getTargetFeatureString(),
         TM->getTargetTriple().isArch64Bit() ? "lp64" : "ilp32", 0, 0, *TM);
 
-    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 42, *MMI);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, MMI->getContext(), 42);
   }
 };
 

>From 7a3bc44c89b675b5fda7e90cf59e74d54778917f Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev at amd.com>
Date: Fri, 26 Jul 2024 12:10:30 +0300
Subject: [PATCH 251/259] [AMDGPU][MC][NFCI] Eliminate printU4ImmDecOperand().
 (#100589)

This is hoped to make things a bit safer by not masking the value to
print, and to make the logic in printDPPCtrl() a bit more explicit.

Part of <https://github.com/llvm/llvm-project/issues/62629>.
---
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 31 ++++++-------------
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |  1 -
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fc7aba3483aad..c31f85dbea127 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -65,11 +65,6 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
     printU32ImmOperand(MI, OpNo, STI, O);
 }
 
-void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
-}
-
 void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
                                               raw_ostream &O) {
   O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
@@ -719,29 +714,25 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
 void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
                                       const MCSubtargetInfo &STI,
                                       raw_ostream &O) {
-  O << " wait_vdst:";
-  printU4ImmDecOperand(MI, OpNo, O);
+  O << " wait_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
 }
 
 void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
-  O << " wait_va_vdst:";
-  printU4ImmDecOperand(MI, OpNo, O);
+  O << " wait_va_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
 }
 
 void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
-  O << " wait_vm_vsrc:";
-  printU4ImmDecOperand(MI, OpNo, O);
+  O << " wait_vm_vsrc:" << formatDec(MI->getOperand(OpNo).getImm());
 }
 
 void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
                                     const MCSubtargetInfo &STI,
                                     raw_ostream &O) {
-  O << " wait_exp:";
-  printU4ImmDecOperand(MI, OpNo, O);
+  O << " wait_exp:" << formatDec(MI->getOperand(OpNo).getImm());
 }
 
 bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
@@ -1065,16 +1056,13 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
     O << formatDec((Imm & 0xc0) >> 6) << ']';
   } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
              (Imm <= DppCtrl::ROW_SHL_LAST)) {
-    O << "row_shl:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_shl:" << formatDec(Imm - DppCtrl::ROW_SHL0);
   } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
              (Imm <= DppCtrl::ROW_SHR_LAST)) {
-    O << "row_shr:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_shr:" << formatDec(Imm - DppCtrl::ROW_SHR0);
   } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
              (Imm <= DppCtrl::ROW_ROR_LAST)) {
-    O << "row_ror:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_ror:" << formatDec(Imm - DppCtrl::ROW_ROR0);
   } else if (Imm == DppCtrl::WAVE_SHL1) {
     if (AMDGPU::isGFX10Plus(STI)) {
       O << "/* wave_shl is not supported starting from GFX10 */";
@@ -1126,15 +1114,14 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
            "than GFX90A/GFX10 */";
       return;
     }
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << formatDec(Imm - DppCtrl::ROW_SHARE_FIRST);
   } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
              (Imm <= DppCtrl::ROW_XMASK_LAST)) {
     if (!AMDGPU::isGFX10Plus(STI)) {
       O << "/* row_xmask is not supported on ASICs earlier than GFX10 */";
       return;
     }
-    O << "row_xmask:";
-    printU4ImmDecOperand(MI, OpNo, O);
+    O << "row_xmask:" << formatDec(Imm - DppCtrl::ROW_XMASK_FIRST);
   } else {
     O << "/* Invalid dpp_ctrl value */";
   }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c5fad384348df..4a39022aea7cf 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -38,7 +38,6 @@ class AMDGPUInstPrinter : public MCInstPrinter {
 private:
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
-  void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);

>From 25da8e5a9798a1c25526e4dc8db571fc25cca187 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 26 Jul 2024 10:12:25 +0100
Subject: [PATCH 252/259] Reapply "[LICM] Fold associative binary ops to
 promote code hoisting (#81608)" (#100377)

This reapplies a more strict version of
https://github.com/llvm/llvm-project/commit/f2ccf80136a01ca69f766becafb329db6c54c0c8.

Perform the transformation

  "(LV op C1) op C2" ==> "LV op (C1 op C2)"

where op is an associative binary op, LV is a loop variant, and C1 and
C2 are loop invariants, and hoist (C1 op C2) into the preheader.

For now this fold is restricted to ADDs.
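
As a minimal sketch (function and value names here are illustrative, not
taken from the patch's tests), the fold rewrites IR of this shape:

  define void @before(i64 %c1, i64 %c2) {
  entry:
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %t = add i64 %iv, %c1       ; (LV op C1), stays in the loop
    %iv.next = add i64 %t, %c2  ; ((LV op C1) op C2)
    br label %loop
  }

into:

  define void @after(i64 %c1, i64 %c2) {
  entry:
    %inv = add i64 %c1, %c2      ; (C1 op C2) hoisted into the preheader
    br label %loop
  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %iv.next = add i64 %iv, %inv ; LV op (C1 op C2)
    br label %loop
  }

leaving only one loop-variant add in the loop body.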
---
 llvm/lib/Transforms/Scalar/LICM.cpp        |  70 +++++
 llvm/test/CodeGen/PowerPC/common-chain.ll  | 315 +++++++++++----------
 llvm/test/Transforms/LICM/hoist-binop.ll   | 226 +++++++++++++++
 llvm/test/Transforms/LICM/sink-foldable.ll |   5 +-
 4 files changed, 459 insertions(+), 157 deletions(-)
 create mode 100644 llvm/test/Transforms/LICM/hoist-binop.ll

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 91ef2b4b7c183..329b3ef0c8e4b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
 STATISTIC(NumIntAssociationsHoisted,
           "Number of invariant int expressions "
           "reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+                                    "reassociated and hoisted out of the loop");
 
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
@@ -2779,6 +2781,68 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   return true;
 }
 
+/// Reassociate associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants that we want to hoist.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+                               ICFLoopSafetyInfo &SafetyInfo,
+                               MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+                               DominatorTree *DT) {
+  auto *BO = dyn_cast<BinaryOperator>(&I);
+  if (!BO || !BO->isAssociative())
+    return false;
+
+  // Only fold ADDs for now.
+  Instruction::BinaryOps Opcode = BO->getOpcode();
+  if (Opcode != Instruction::Add)
+    return false;
+
+  auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(0));
+  if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative())
+    return false;
+
+  // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+  Value *LV = BO0->getOperand(0);
+  Value *C1 = BO0->getOperand(1);
+  Value *C2 = BO->getOperand(1);
+
+  if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) || !L.isLoopInvariant(C2))
+    return false;
+
+  auto *Preheader = L.getLoopPreheader();
+  assert(Preheader && "Loop is not in simplify form?");
+
+  auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op",
+                                     Preheader->getTerminator());
+  auto *NewBO =
+      BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);
+
+  // Copy NUW for ADDs if both instructions have it.
+  if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() &&
+      BO0->hasNoUnsignedWrap()) {
+    Inv->setHasNoUnsignedWrap(true);
+    NewBO->setHasNoUnsignedWrap(true);
+  }
+
+  BO->replaceAllUsesWith(NewBO);
+  eraseInstruction(*BO, SafetyInfo, MSSAU);
+
+  // (LV op C1) might not be erased if it has more uses than the one we just
+  // replaced.
+  if (BO0->use_empty())
+    eraseInstruction(*BO0, SafetyInfo, MSSAU);
+
+  return true;
+}
+
 static bool hoistArithmetics(Instruction &I, Loop &L,
                              ICFLoopSafetyInfo &SafetyInfo,
                              MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2816,6 +2880,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
     return true;
   }
 
+  if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+    ++NumHoisted;
+    ++NumBOAssociationsHoisted;
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll
index 5f8c21e30f8fd..ccf0e4520f468 100644
--- a/llvm/test/CodeGen/PowerPC/common-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/common-chain.ll
@@ -642,8 +642,8 @@ define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2
 ; CHECK-NEXT:    cmpdi r7, 0
 ; CHECK-NEXT:    ble cr0, .LBB6_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    add r6, r6, r4
 ; CHECK-NEXT:    add r5, r5, r4
+; CHECK-NEXT:    add r6, r6, r4
 ; CHECK-NEXT:    mtctr r7
 ; CHECK-NEXT:    sldi r4, r4, 1
 ; CHECK-NEXT:    add r5, r3, r5
@@ -743,214 +743,219 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64
 ; CHECK-NEXT:    std r9, -184(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r8, -176(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r7, -168(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r3, -160(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r4, -160(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    ble cr0, .LBB7_7
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    sldi r6, r6, 2
-; CHECK-NEXT:    li r7, 1
-; CHECK-NEXT:    mr r30, r10
-; CHECK-NEXT:    cmpdi r6, 1
-; CHECK-NEXT:    iselgt r7, r6, r7
-; CHECK-NEXT:    addi r8, r7, -1
-; CHECK-NEXT:    clrldi r6, r7, 63
-; CHECK-NEXT:    cmpldi r8, 3
+; CHECK-NEXT:    sldi r4, r6, 2
+; CHECK-NEXT:    li r6, 1
+; CHECK-NEXT:    mr r0, r10
+; CHECK-NEXT:    std r10, -192(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    cmpdi r4, 1
+; CHECK-NEXT:    iselgt r4, r4, r6
+; CHECK-NEXT:    addi r7, r4, -1
+; CHECK-NEXT:    clrldi r6, r4, 63
+; CHECK-NEXT:    cmpldi r7, 3
 ; CHECK-NEXT:    blt cr0, .LBB7_4
 ; CHECK-NEXT:  # %bb.2: # %for.body.preheader.new
-; CHECK-NEXT:    ld r14, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    mulli r24, r30, 24
-; CHECK-NEXT:    ld r16, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r15, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r3, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    rldicl r0, r7, 62, 2
-; CHECK-NEXT:    sldi r11, r30, 5
-; CHECK-NEXT:    sldi r19, r30, 4
-; CHECK-NEXT:    sldi r7, r14, 3
-; CHECK-NEXT:    add r14, r30, r14
-; CHECK-NEXT:    sldi r10, r16, 3
-; CHECK-NEXT:    sldi r12, r15, 3
-; CHECK-NEXT:    add r16, r30, r16
-; CHECK-NEXT:    add r15, r30, r15
-; CHECK-NEXT:    add r27, r11, r7
-; CHECK-NEXT:    add r22, r24, r7
-; CHECK-NEXT:    add r17, r19, r7
-; CHECK-NEXT:    sldi r2, r14, 3
-; CHECK-NEXT:    add r26, r24, r10
-; CHECK-NEXT:    add r25, r24, r12
-; CHECK-NEXT:    add r21, r19, r10
-; CHECK-NEXT:    add r20, r19, r12
-; CHECK-NEXT:    add r8, r11, r10
-; CHECK-NEXT:    sldi r16, r16, 3
-; CHECK-NEXT:    add r29, r5, r27
-; CHECK-NEXT:    add r28, r4, r27
-; CHECK-NEXT:    add r27, r3, r27
-; CHECK-NEXT:    add r24, r5, r22
-; CHECK-NEXT:    add r23, r4, r22
-; CHECK-NEXT:    add r22, r3, r22
-; CHECK-NEXT:    add r19, r5, r17
-; CHECK-NEXT:    add r18, r4, r17
-; CHECK-NEXT:    add r17, r3, r17
-; CHECK-NEXT:    add r14, r5, r2
-; CHECK-NEXT:    add r31, r4, r2
-; CHECK-NEXT:    add r2, r3, r2
-; CHECK-NEXT:    add r9, r5, r8
-; CHECK-NEXT:    add r8, r11, r12
+; CHECK-NEXT:    ld r0, -192(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r30, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r8, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    rldicl r7, r4, 62, 2
+; CHECK-NEXT:    ld r9, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r11, r0, r30
+; CHECK-NEXT:    add r4, r0, r0
+; CHECK-NEXT:    mulli r23, r0, 24
+; CHECK-NEXT:    add r14, r0, r8
+; CHECK-NEXT:    sldi r12, r0, 5
+; CHECK-NEXT:    add r31, r0, r9
+; CHECK-NEXT:    sldi r9, r9, 3
+; CHECK-NEXT:    sldi r18, r0, 4
+; CHECK-NEXT:    sldi r8, r8, 3
+; CHECK-NEXT:    add r10, r4, r4
+; CHECK-NEXT:    sldi r4, r30, 3
+; CHECK-NEXT:    sldi r11, r11, 3
+; CHECK-NEXT:    add r26, r12, r9
+; CHECK-NEXT:    add r16, r18, r9
+; CHECK-NEXT:    add r29, r12, r8
+; CHECK-NEXT:    add r19, r18, r8
+; CHECK-NEXT:    add r30, r12, r4
+; CHECK-NEXT:    mr r20, r4
+; CHECK-NEXT:    std r4, -200(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    ld r4, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r15, r5, r11
+; CHECK-NEXT:    sldi r11, r14, 3
+; CHECK-NEXT:    add r29, r5, r29
+; CHECK-NEXT:    add r28, r3, r26
+; CHECK-NEXT:    add r19, r5, r19
+; CHECK-NEXT:    add r21, r23, r9
+; CHECK-NEXT:    add r24, r23, r8
+; CHECK-NEXT:    add r14, r5, r11
+; CHECK-NEXT:    sldi r11, r31, 3
+; CHECK-NEXT:    add r25, r23, r20
+; CHECK-NEXT:    add r20, r18, r20
+; CHECK-NEXT:    add r30, r5, r30
+; CHECK-NEXT:    add r18, r3, r16
+; CHECK-NEXT:    add r24, r5, r24
+; CHECK-NEXT:    add r23, r3, r21
+; CHECK-NEXT:    add r27, r4, r26
+; CHECK-NEXT:    add r22, r4, r21
+; CHECK-NEXT:    add r17, r4, r16
+; CHECK-NEXT:    add r2, r4, r11
+; CHECK-NEXT:    rldicl r4, r7, 2, 1
+; CHECK-NEXT:    sub r7, r8, r9
+; CHECK-NEXT:    ld r8, -200(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    add r26, r5, r26
 ; CHECK-NEXT:    add r25, r5, r25
 ; CHECK-NEXT:    add r21, r5, r21
 ; CHECK-NEXT:    add r20, r5, r20
 ; CHECK-NEXT:    add r16, r5, r16
-; CHECK-NEXT:    add r8, r5, r8
-; CHECK-NEXT:    rldicl r3, r0, 2, 1
-; CHECK-NEXT:    addi r3, r3, -4
-; CHECK-NEXT:    sub r0, r12, r7
-; CHECK-NEXT:    sub r12, r10, r7
-; CHECK-NEXT:    li r7, 0
-; CHECK-NEXT:    mr r10, r30
-; CHECK-NEXT:    sldi r15, r15, 3
-; CHECK-NEXT:    add r15, r5, r15
-; CHECK-NEXT:    rldicl r3, r3, 62, 2
-; CHECK-NEXT:    addi r3, r3, 1
-; CHECK-NEXT:    mtctr r3
+; CHECK-NEXT:    add r31, r5, r11
+; CHECK-NEXT:    add r11, r3, r11
+; CHECK-NEXT:    addi r4, r4, -4
+; CHECK-NEXT:    rldicl r4, r4, 62, 2
+; CHECK-NEXT:    sub r8, r8, r9
+; CHECK-NEXT:    li r9, 0
+; CHECK-NEXT:    addi r4, r4, 1
+; CHECK-NEXT:    mtctr r4
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB7_3: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lfd f0, 0(r2)
-; CHECK-NEXT:    lfd f1, 0(r31)
-; CHECK-NEXT:    add r3, r10, r30
-; CHECK-NEXT:    add r3, r3, r30
+; CHECK-NEXT:    lfd f0, 0(r11)
+; CHECK-NEXT:    lfd f1, 0(r2)
+; CHECK-NEXT:    add r0, r0, r10
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfd f1, 0(r14)
-; CHECK-NEXT:    add r3, r3, r30
-; CHECK-NEXT:    add r10, r3, r30
+; CHECK-NEXT:    lfd f1, 0(r31)
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfd f0, 0(r14)
-; CHECK-NEXT:    add r14, r14, r11
-; CHECK-NEXT:    lfdx f0, r2, r0
-; CHECK-NEXT:    lfdx f1, r31, r0
+; CHECK-NEXT:    stfd f0, 0(r31)
+; CHECK-NEXT:    add r31, r31, r12
+; CHECK-NEXT:    lfdx f0, r11, r7
+; CHECK-NEXT:    lfdx f1, r2, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r15, r7
+; CHECK-NEXT:    lfdx f1, r14, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r15, r7
-; CHECK-NEXT:    lfdx f0, r2, r12
-; CHECK-NEXT:    lfdx f1, r31, r12
-; CHECK-NEXT:    add r2, r2, r11
-; CHECK-NEXT:    add r31, r31, r11
+; CHECK-NEXT:    stfdx f0, r14, r9
+; CHECK-NEXT:    lfdx f0, r11, r8
+; CHECK-NEXT:    lfdx f1, r2, r8
+; CHECK-NEXT:    add r11, r11, r12
+; CHECK-NEXT:    add r2, r2, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r16, r7
+; CHECK-NEXT:    lfdx f1, r15, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r16, r7
-; CHECK-NEXT:    lfd f0, 0(r17)
-; CHECK-NEXT:    lfd f1, 0(r18)
+; CHECK-NEXT:    stfdx f0, r15, r9
+; CHECK-NEXT:    lfd f0, 0(r18)
+; CHECK-NEXT:    lfd f1, 0(r17)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r19, r7
+; CHECK-NEXT:    lfdx f1, r16, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r19, r7
-; CHECK-NEXT:    lfdx f0, r17, r0
-; CHECK-NEXT:    lfdx f1, r18, r0
+; CHECK-NEXT:    stfdx f0, r16, r9
+; CHECK-NEXT:    lfdx f0, r18, r7
+; CHECK-NEXT:    lfdx f1, r17, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r20, r7
+; CHECK-NEXT:    lfdx f1, r19, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r20, r7
-; CHECK-NEXT:    lfdx f0, r17, r12
-; CHECK-NEXT:    lfdx f1, r18, r12
-; CHECK-NEXT:    add r17, r17, r11
-; CHECK-NEXT:    add r18, r18, r11
+; CHECK-NEXT:    stfdx f0, r19, r9
+; CHECK-NEXT:    lfdx f0, r18, r8
+; CHECK-NEXT:    lfdx f1, r17, r8
+; CHECK-NEXT:    add r18, r18, r12
+; CHECK-NEXT:    add r17, r17, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r21, r7
+; CHECK-NEXT:    lfdx f1, r20, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r21, r7
-; CHECK-NEXT:    lfd f0, 0(r22)
-; CHECK-NEXT:    lfd f1, 0(r23)
+; CHECK-NEXT:    stfdx f0, r20, r9
+; CHECK-NEXT:    lfd f0, 0(r23)
+; CHECK-NEXT:    lfd f1, 0(r22)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r24, r7
+; CHECK-NEXT:    lfdx f1, r21, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r24, r7
-; CHECK-NEXT:    lfdx f0, r22, r0
-; CHECK-NEXT:    lfdx f1, r23, r0
+; CHECK-NEXT:    stfdx f0, r21, r9
+; CHECK-NEXT:    lfdx f0, r23, r7
+; CHECK-NEXT:    lfdx f1, r22, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r25, r7
+; CHECK-NEXT:    lfdx f1, r24, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r25, r7
-; CHECK-NEXT:    lfdx f0, r22, r12
-; CHECK-NEXT:    lfdx f1, r23, r12
-; CHECK-NEXT:    add r22, r22, r11
-; CHECK-NEXT:    add r23, r23, r11
+; CHECK-NEXT:    stfdx f0, r24, r9
+; CHECK-NEXT:    lfdx f0, r23, r8
+; CHECK-NEXT:    lfdx f1, r22, r8
+; CHECK-NEXT:    add r23, r23, r12
+; CHECK-NEXT:    add r22, r22, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r26, r7
+; CHECK-NEXT:    lfdx f1, r25, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r26, r7
-; CHECK-NEXT:    lfd f0, 0(r27)
-; CHECK-NEXT:    lfd f1, 0(r28)
+; CHECK-NEXT:    stfdx f0, r25, r9
+; CHECK-NEXT:    lfd f0, 0(r28)
+; CHECK-NEXT:    lfd f1, 0(r27)
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r29, r7
+; CHECK-NEXT:    lfdx f1, r26, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r29, r7
-; CHECK-NEXT:    lfdx f0, r27, r0
-; CHECK-NEXT:    lfdx f1, r28, r0
+; CHECK-NEXT:    stfdx f0, r26, r9
+; CHECK-NEXT:    lfdx f0, r28, r7
+; CHECK-NEXT:    lfdx f1, r27, r7
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r8, r7
+; CHECK-NEXT:    lfdx f1, r29, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r8, r7
-; CHECK-NEXT:    lfdx f0, r27, r12
-; CHECK-NEXT:    lfdx f1, r28, r12
-; CHECK-NEXT:    add r27, r27, r11
-; CHECK-NEXT:    add r28, r28, r11
+; CHECK-NEXT:    stfdx f0, r29, r9
+; CHECK-NEXT:    lfdx f0, r28, r8
+; CHECK-NEXT:    lfdx f1, r27, r8
+; CHECK-NEXT:    add r28, r28, r12
+; CHECK-NEXT:    add r27, r27, r12
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r9, r7
+; CHECK-NEXT:    lfdx f1, r30, r9
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r9, r7
-; CHECK-NEXT:    add r7, r7, r11
+; CHECK-NEXT:    stfdx f0, r30, r9
+; CHECK-NEXT:    add r9, r9, r12
 ; CHECK-NEXT:    bdnz .LBB7_3
 ; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT:    ld r7, -192(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    cmpldi r6, 0
 ; CHECK-NEXT:    beq cr0, .LBB7_7
 ; CHECK-NEXT:  # %bb.5: # %for.body.epil.preheader
-; CHECK-NEXT:    ld r3, -184(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld r0, -160(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    sldi r8, r30, 3
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r7, r5, r3
-; CHECK-NEXT:    add r9, r4, r3
-; CHECK-NEXT:    add r11, r0, r3
-; CHECK-NEXT:    ld r3, -176(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r12, r5, r3
-; CHECK-NEXT:    add r30, r4, r3
-; CHECK-NEXT:    add r29, r0, r3
-; CHECK-NEXT:    ld r3, -168(r1) # 8-byte Folded Reload
-; CHECK-NEXT:    add r3, r10, r3
-; CHECK-NEXT:    li r10, 0
-; CHECK-NEXT:    sldi r3, r3, 3
-; CHECK-NEXT:    add r5, r5, r3
-; CHECK-NEXT:    add r4, r4, r3
-; CHECK-NEXT:    add r3, r0, r3
+; CHECK-NEXT:    ld r4, -184(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r29, -160(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    mr r30, r3
+; CHECK-NEXT:    sldi r7, r7, 3
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r4, r4, 3
+; CHECK-NEXT:    add r3, r5, r4
+; CHECK-NEXT:    add r8, r29, r4
+; CHECK-NEXT:    add r9, r30, r4
+; CHECK-NEXT:    ld r4, -176(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r4, r4, 3
+; CHECK-NEXT:    add r10, r5, r4
+; CHECK-NEXT:    add r11, r29, r4
+; CHECK-NEXT:    add r12, r30, r4
+; CHECK-NEXT:    ld r4, -168(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    add r4, r0, r4
+; CHECK-NEXT:    sldi r0, r4, 3
+; CHECK-NEXT:    add r5, r5, r0
+; CHECK-NEXT:    add r4, r29, r0
+; CHECK-NEXT:    add r30, r30, r0
+; CHECK-NEXT:    li r0, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB7_6: # %for.body.epil
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lfdx f0, r3, r10
-; CHECK-NEXT:    lfdx f1, r4, r10
+; CHECK-NEXT:    lfdx f0, r30, r0
+; CHECK-NEXT:    lfdx f1, r4, r0
 ; CHECK-NEXT:    addi r6, r6, -1
 ; CHECK-NEXT:    cmpldi r6, 0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
 ; CHECK-NEXT:    lfd f1, 0(r5)
 ; CHECK-NEXT:    xsadddp f0, f1, f0
 ; CHECK-NEXT:    stfd f0, 0(r5)
-; CHECK-NEXT:    add r5, r5, r8
-; CHECK-NEXT:    lfdx f0, r29, r10
-; CHECK-NEXT:    lfdx f1, r30, r10
+; CHECK-NEXT:    add r5, r5, r7
+; CHECK-NEXT:    lfdx f0, r12, r0
+; CHECK-NEXT:    lfdx f1, r11, r0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r12, r10
+; CHECK-NEXT:    lfdx f1, r10, r0
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r12, r10
-; CHECK-NEXT:    lfdx f0, r11, r10
-; CHECK-NEXT:    lfdx f1, r9, r10
+; CHECK-NEXT:    stfdx f0, r10, r0
+; CHECK-NEXT:    lfdx f0, r9, r0
+; CHECK-NEXT:    lfdx f1, r8, r0
 ; CHECK-NEXT:    xsmuldp f0, f0, f1
-; CHECK-NEXT:    lfdx f1, r7, r10
+; CHECK-NEXT:    lfdx f1, r3, r0
 ; CHECK-NEXT:    xsadddp f0, f1, f0
-; CHECK-NEXT:    stfdx f0, r7, r10
-; CHECK-NEXT:    add r10, r10, r8
+; CHECK-NEXT:    stfdx f0, r3, r0
+; CHECK-NEXT:    add r0, r0, r7
 ; CHECK-NEXT:    bne cr0, .LBB7_6
 ; CHECK-NEXT:  .LBB7_7: # %for.cond.cleanup
 ; CHECK-NEXT:    ld r2, -152(r1) # 8-byte Folded Reload
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 0000000000000..8bda74eec34a0
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Fold ADD and remove old op if unused.
+define void @add_one_use(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_one_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  %index.next = add i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD and copy NUW if both ops have it.
+; https://alive2.llvm.org/ce/z/bPAT7Z
+define void @add_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add nuw i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add nuw i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nuw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NUW if only one op has it.
+define void @add_no_nuw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nuw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NSW if one op has it.
+define void @add_no_nsw(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nsw i64 %step.add, %c2
+  br label %loop
+}
+
+; Fold ADD but don't copy NSW even if both ops have it.
+define void @add_no_nsw_2(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_no_nsw_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add nsw i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add nsw i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold if the ops are different (even if they are both associative).
+define void @diff_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @diff_ops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = mul i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = mul i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold if the ops are not associative.
+define void @noassoc_ops(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @noassoc_ops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = sub i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = sub i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = sub i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = sub i64 %step.add, %c2
+  br label %loop
+}
+
+; Don't fold floating-point ops, even if they are associative. This would be
+; valid, but is currently disabled.
+define void @fadd(float %c1, float %c2) {
+; CHECK-LABEL: @fadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = fadd fast float [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(float [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = fadd fast float [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi float [ 0., %entry ], [ %index.next, %loop ]
+  %step.add = fadd fast float %index, %c1
+  call void @use(float %step.add)
+  %index.next = fadd fast float %step.add, %c2
+  br label %loop
+}
+
+; Original reproducer, adapted from:
+;   for(long i = 0; i < n; ++i)
+;     a[i] = (i*k) * v;
+define void @test(i64 %n, i64 %k) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
+; CHECK-NEXT:    [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  %k.2 = shl nuw nsw i64 %k, 1
+  %vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
+  %.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
+  %.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
+  br label %loop
+
+loop:
+  %vec.ind = phi <2 x i64> [ %vec.init, %entry ], [ %vec.ind.next, %loop ]
+  %step.add = add <2 x i64> %vec.ind, %.splat
+  call void @use(<2 x i64> %step.add)
+  %vec.ind.next = add <2 x i64> %step.add, %.splat
+  br label %loop
+}
+
+declare void @use()
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5a12563..36e2eab6313dc 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -77,9 +77,10 @@ return:
 define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = add i32 1, 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond:
-; CHECK-NEXT:    [[I_ADDR_0:%.*]] = phi i32 [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[I_ADDR_0:%.*]] = phi i32 [ [[ADD_REASS:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT:    [[P_ADDR_0:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[IF_END]] ]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_ADDR_0]], [[J:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[LOOPEXIT0:%.*]]
@@ -97,7 +98,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
 ; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT:    [[ADD_REASS]] = add i32 [[I_ADDR]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
 ; CHECK:       loopexit0:
 ; CHECK-NEXT:    [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]

>From 35dfe8013c2e100ffa159ca183430422a370d3a4 Mon Sep 17 00:00:00 2001
From: Timm Bäder <tbaeder at redhat.com>
Date: Thu, 25 Jul 2024 19:58:56 +0200
Subject: [PATCH 253/259] [clang][Interp][NFC] Don't try to access empty
 EvaluationResult

---
 clang/lib/AST/Interp/InterpBuiltin.cpp | 37 ++++++++++++++------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c59bbc8313edc..d7538c76e91d9 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -1170,27 +1170,30 @@ static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
       Stk.clear();
     }
 
-    const APValue &LV = Res.toAPValue();
-    if (!Res.isInvalid() && LV.isLValue()) {
-      APValue::LValueBase Base = LV.getLValueBase();
-      if (Base.isNull()) {
-        // A null base is acceptable.
-        return returnInt(true);
-      } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
-        if (!isa<StringLiteral>(E))
+    if (!Res.isInvalid() && !Res.empty()) {
+      const APValue &LV = Res.toAPValue();
+      if (LV.isLValue()) {
+        APValue::LValueBase Base = LV.getLValueBase();
+        if (Base.isNull()) {
+          // A null base is acceptable.
+          return returnInt(true);
+        } else if (const auto *E = Base.dyn_cast<const Expr *>()) {
+          if (!isa<StringLiteral>(E))
+            return returnInt(false);
+          return returnInt(LV.getLValueOffset().isZero());
+        } else if (Base.is<TypeInfoLValue>()) {
+          // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
+          // evaluate to true.
+          return returnInt(true);
+        } else {
+          // Any other base is not constant enough for GCC.
           return returnInt(false);
-        return returnInt(LV.getLValueOffset().isZero());
-      } else if (Base.is<TypeInfoLValue>()) {
-        // Surprisingly, GCC considers __builtin_constant_p(&typeid(int)) to
-        // evaluate to true.
-        return returnInt(true);
-      } else {
-        // Any other base is not constant enough for GCC.
-        return returnInt(false);
+        }
       }
     }
 
-    return returnInt(!Res.isInvalid() && !Res.empty());
+    // Otherwise, any constant value is good enough.
+    return returnInt(true);
   }
 
   return returnInt(false);

>From d65f0375910f6dfeaa94c1060e7d81fb7bdf76be Mon Sep 17 00:00:00 2001
From: Vikash Gupta <Vikash.Gupta at amd.com>
Date: Fri, 26 Jul 2024 15:24:06 +0530
Subject: [PATCH 254/259] [Clang] Use private address space for builtin_alloca
 return type for OpenCL (#95750)

__builtin_alloca was returning a flat pointer with no address space when
compiled with OpenCL 1.2 or below, but worked fine with OpenCL 2.0 and
above. This is because the latter uses the generic address space, which
supports casts to other address spaces (e.g. to the private address space,
which is used for stack allocation).

But since the builtin returns a pointer to the stack, it should point to
the private address space irrespective of the OpenCL version, because
__builtin_alloca allocates stack memory for the function in which it is
called. Thus, the builtin call needs to be given the appropriate return
type: a pointer to the private address space.
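
A minimal IR-level sketch of the intended result (assuming a target whose
alloca address space is 5, such as the amdgcn-amd-amdhsa target used by the
new test; the function name and datalayout string are illustrative):

  target datalayout = "A5"

  define void @alloca_example(i64 %size) {
  entry:
    ; __builtin_alloca now produces a private (addrspace(5)) pointer
    ; irrespective of the OpenCL version
    %buf = alloca i8, i64 %size, align 8, addrspace(5)
    ret void
  }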
---
 clang/lib/Sema/SemaChecking.cpp             |  15 +++
 clang/test/CodeGenOpenCL/builtins-alloca.cl | 141 ++++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-alloca.cl

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 07a34fda494d8..bb30b1e289a1c 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1477,6 +1477,18 @@ static bool BuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall,
   return false;
 }
 
+// In OpenCL, __builtin_alloca_* should return a pointer to address space
+// that corresponds to the stack address space i.e private address space.
+static void builtinAllocaAddrSpace(Sema &S, CallExpr *TheCall) {
+  QualType RT = TheCall->getType();
+  assert((RT->isPointerType() && !(RT->getPointeeType().hasAddressSpace())) &&
+         "__builtin_alloca has invalid address space");
+
+  RT = RT->getPointeeType();
+  RT = S.Context.getAddrSpaceQualType(RT, LangAS::opencl_private);
+  TheCall->setType(S.Context.getPointerType(RT));
+}
+
 namespace {
 enum PointerAuthOpKind {
   PAO_Strip,
@@ -2214,6 +2226,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
   case Builtin::BI__builtin_alloca_uninitialized:
     Diag(TheCall->getBeginLoc(), diag::warn_alloca)
         << TheCall->getDirectCallee();
+    if (getLangOpts().OpenCL) {
+      builtinAllocaAddrSpace(*this, TheCall);
+    }
     break;
   case Builtin::BI__arithmetic_fence:
     if (BuiltinArithmeticFence(TheCall))
diff --git a/clang/test/CodeGenOpenCL/builtins-alloca.cl b/clang/test/CodeGenOpenCL/builtins-alloca.cl
new file mode 100644
index 0000000000000..474e95e74e006
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-alloca.cl
@@ -0,0 +1,141 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -cl-ext=+__opencl_c_generic_address_space \
+// RUN:     -emit-llvm -o - | FileCheck --check-prefixes=OPENCL %s
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca(unsigned n) {
+    __private float* alloc_ptr = (__private float*)__builtin_alloca(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_uninitialized(unsigned n) {
+    __private float* alloc_ptr_uninitialized = (__private float*)__builtin_alloca_uninitialized(n*sizeof(int));
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_with_align(unsigned n) {
+    __private float* alloc_ptr_align = (__private float*)__builtin_alloca_with_align((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test1_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[MUL:%.*]] = mul i64 [[CONV]], 4
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[MUL]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test1_builtin_alloca_with_align_uninitialized(unsigned n) {
+    __private float* alloc_ptr_align_uninitialized = (__private float*)__builtin_alloca_with_align_uninitialized((n*sizeof(int)), 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca(unsigned n) {
+    __private void *alloc_ptr = __builtin_alloca(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 8, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_uninitialized(unsigned n) {
+    __private void *alloc_ptr_uninitialized = __builtin_alloca_uninitialized(n);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_with_align(unsigned n) {
+    __private void *alloc_ptr_align = __builtin_alloca_with_align(n, 8);
+}
+
+// OPENCL-LABEL: define dso_local void @test2_builtin_alloca_with_align_uninitialized(
+// OPENCL-SAME: i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// OPENCL-NEXT:  [[ENTRY:.*:]]
+// OPENCL-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// OPENCL-NEXT:    [[ALLOC_PTR_ALIGN_UNINITIALIZED:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// OPENCL-NEXT:    store i32 [[N]], ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4
+// OPENCL-NEXT:    [[CONV:%.*]] = zext i32 [[TMP0]] to i64
+// OPENCL-NEXT:    [[TMP1:%.*]] = alloca i8, i64 [[CONV]], align 1, addrspace(5)
+// OPENCL-NEXT:    store ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[ALLOC_PTR_ALIGN_UNINITIALIZED]], align 4
+// OPENCL-NEXT:    ret void
+//
+void test2_builtin_alloca_with_align_uninitialized(unsigned n) {
+    __private void *alloc_ptr_align_uninitialized = __builtin_alloca_with_align_uninitialized(n, 8);
+}

>From 88549cf47cafc4c4a6042393ee07fc2dc20566cc Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav at gmail.com>
Date: Fri, 26 Jul 2024 13:09:26 +0300
Subject: [PATCH 255/259] [clang-doc] Disable flaky test `basic-project.test`

---
 clang-tools-extra/test/clang-doc/basic-project.test | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 51d3ac6ce6dcd..38569d824f1f0 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,3 +1,6 @@
+// See https://github.com/llvm/llvm-project/issues/97507.
+// UNSUPPORTED: target={{.*}}
+
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
 // RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json

>From 8b02f31aea6b95db70273df2ebd9f8c533a62cc1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jul 2024 11:16:44 +0100
Subject: [PATCH 256/259] [VPlan] Consistently use VF.Width when getting the
 plan for the main loop VF (NFC)

Cleanup to make things consistent in preparation for
https://github.com/llvm/llvm-project/pull/98821.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..ea19af10a1474 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10027,8 +10027,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
+        assert(EPI.MainLoopVF == VF.Width && "VFs must match");
         std::unique_ptr<VPlan> BestMainPlan(
-            LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
+            LVP.getBestPlanFor(VF.Width).duplicate());
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;

>From 94e966255fb4309f24805273d2e17f63a4e36d17 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 26 Jul 2024 10:59:09 +0100
Subject: [PATCH 257/259] [X86] skylake only uses Port0 for (v)phminposuw
 instructions

Now matches skylake-server, and matches the reports from uops.info, Agner and instlatx64.
---
 llvm/lib/Target/X86/X86SchedSkylakeClient.td           |  2 +-
 .../tools/llvm-mca/X86/SkylakeClient/resources-avx1.s  | 10 +++++-----
 .../tools/llvm-mca/X86/SkylakeClient/resources-sse41.s | 10 +++++-----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 3ee931fe5ed9c..d764cb3fe3130 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -404,7 +404,7 @@ defm : SKLWriteResPair<WritePSADBW,  [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW
 defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>;
 defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>;
 defm : X86WriteResPairUnsupported<WritePSADBWZ>;
-defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS.
+defm : SKLWriteResPair<WritePHMINPOS, [SKLPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
 
 // Vector integer shifts.
 defm : SKLWriteResPair<WriteVecShift,     [SKLPort0], 1, [1], 1, 5>;
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index ef5a9e34a932b..2711bc4579b00 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1461,8 +1461,8 @@ vzeroupper
 # CHECK-NEXT:  4      9     2.00    *                   vphaddsw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphaddw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     0.50                        vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   vphminposuw	(%rax), %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphsubd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   vphsubd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  3      3     2.00                        vphsubsw	%xmm0, %xmm1, %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -     126.00 339.58 199.58 173.83 173.83 38.00  326.58 6.25   11.33
+# CHECK-NEXT:  -     126.00 340.58 198.58 173.83 173.83 38.00  326.58 6.25   11.33
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -2171,8 +2171,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -     2.00    -      -     vphaddsw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     2.33    -      -     vphaddw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     2.33    -      -     vphaddw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     vphminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     vphminposuw	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     2.33    -      -     vphsubd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     2.33    -      -     vphsubd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -     2.00    -      -     vphsubsw	%xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 1d8d67fd323f1..0eec7e4cf58e8 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -189,8 +189,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  2      3     1.00                        pextrq	$1, %xmm0, %rcx
 # CHECK-NEXT:  3      2     1.00           *            pextrq	$1, %xmm0, (%rax)
 # CHECK-NEXT:  3      2     1.00           *            pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      4     0.50                        phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      10    0.50    *                   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    1.00    *                   phminposuw	(%rax), %xmm2
 # CHECK-NEXT:  2      2     2.00                        pinsrb	$1, %eax, %xmm1
 # CHECK-NEXT:  2      6     1.00    *                   pinsrb	$1, (%rax), %xmm1
 # CHECK-NEXT:  2      2     2.00                        pinsrd	$1, %eax, %xmm1
@@ -266,7 +266,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     37.83  31.33  23.67  23.67  5.00   63.33  0.50   1.67
+# CHECK-NEXT:  -      -     38.83  30.33  23.67  23.67  5.00   63.33  0.50   1.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
@@ -304,8 +304,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -     1.00    -      -      -      -     1.00    -      -     pextrq	$1, %xmm0, %rcx
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   pextrq	$1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   1.00   1.00    -     0.33   pextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -     phminposuw	%xmm0, %xmm2
-# CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -     phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  -      -     1.00    -     0.50   0.50    -      -      -      -     phminposuw	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     pinsrb	$1, %eax, %xmm1
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -     pinsrb	$1, (%rax), %xmm1
 # CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -     pinsrd	$1, %eax, %xmm1

>From 8d4cb92e3536e86b177c354c9c5cf6f06bd61c8d Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Fri, 26 Jul 2024 19:34:27 +0800
Subject: [PATCH 258/259] [NFC][Clang][Interp] Add more test for
 `__builtin_os_log_format_buffer_size` (#100566)

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/test/AST/Interp/builtins.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index a74b68bb9d89b..9b2b20773be58 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -31,3 +31,8 @@ constexpr bool assume() {
   return true;
 }
 static_assert(assume(), "");
+
+void test_builtin_os_log(void *buf, int i, const char *data) {
+  constexpr int len = __builtin_os_log_format_buffer_size("%d %{public}s %{private}.16P", i, data, data);
+  static_assert(len > 0, "Expect len > 0");
+}

>From 4e2e236feebf9001aaf799e546e1341adc2c2c1d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 1 Feb 2023 09:48:43 -0400
Subject: [PATCH 259/259] DAG: Lower is.fpclass fcSubnormal|fcZero to fabs(x) <
 smallest_normal

This produces better code on x86_64 only in the unordered case. It is not
clear what the exact condition should be to avoid the regression in the
ordered case; a free fabs might do it, or it may require legality checks
for the alternative integer expansion.
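
As a rough sketch of the intent (mirroring the issubnormal_or_zero_or_nan_f
test below; the function names are illustrative, and 0x3810000000000000 is
the smallest normal f32, 2^-126):

  define i1 @subnormal_zero_nan(float %x) {
    ; fcSubnormal|fcZero|fcNan = 0xf0 | 0x3 = 243
    %class = call i1 @llvm.is.fpclass.f32(float %x, i32 243)
    ret i1 %class
  }

  ; is conceptually lowered (when FP exceptions are ignored) to:

  define i1 @subnormal_zero_nan_lowered(float %x) {
    %abs = call float @llvm.fabs.f32(float %x)
    ; ult is an unordered compare, so NaN inputs also yield true
    %cmp = fcmp ult float %abs, 0x3810000000000000
    ret i1 %cmp
  }

  declare i1 @llvm.is.fpclass.f32(float, i32)
  declare float @llvm.fabs.f32(float)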
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 38 +++++++++++++-
 llvm/test/CodeGen/X86/is_fpclass.ll           | 52 +++++++++----------
 2 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 140c97ccd90ba..6fd23b5ab9f5f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8556,11 +8556,12 @@ static std::optional<bool> isFCmpEqualZero(FPClassTest Test,
 }
 
 SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
-                                         FPClassTest Test, SDNodeFlags Flags,
-                                         const SDLoc &DL,
+                                         const FPClassTest OrigTestMask,
+                                         SDNodeFlags Flags, const SDLoc &DL,
                                          SelectionDAG &DAG) const {
   EVT OperandVT = Op.getValueType();
   assert(OperandVT.isFloatingPoint());
+  FPClassTest Test = OrigTestMask;
 
   // Degenerated cases.
   if (Test == fcNone)
@@ -8594,9 +8595,21 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
   // exceptions are ignored.
   if (Flags.hasNoFPExcept() &&
       isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) {
+    FPClassTest FPTestMask = Test;
+
     ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ;
     ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ;
 
+    // See if we can fold an | fcNan into an unordered compare.
+    FPClassTest OrderedFPTestMask = FPTestMask & ~fcNan;
+
+    // Can't fold the ordered check if we're only testing for snan or qnan
+    // individually.
+    if ((FPTestMask & fcNan) != fcNan)
+      OrderedFPTestMask = FPTestMask;
+
+    const bool IsOrdered = FPTestMask == OrderedFPTestMask;
+
     if (std::optional<bool> IsCmp0 =
             isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction());
         IsCmp0 && (isCondCodeLegalOrCustom(
@@ -8628,6 +8641,27 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
       return DAG.getSetCC(DL, ResultVT, Abs, Inf,
                           IsInverted ? ISD::SETUNE : ISD::SETOEQ);
     }
+
+    if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) {
+      // TODO: Could handle ordered case, but it produces worse code for
+      // x86. Maybe handle ordered if fabs is free?
+
+      ISD::CondCode OrderedOp = IsInverted ? ISD::SETUGE : ISD::SETOLT;
+      ISD::CondCode UnorderedOp = IsInverted ? ISD::SETOGE : ISD::SETULT;
+
+      if (isCondCodeLegalOrCustom(IsOrdered ? OrderedOp : UnorderedOp,
+                                  OperandVT.getScalarType().getSimpleVT())) {
+        // (issubnormal(x) || iszero(x)) --> fabs(x) < smallest_normal
+
+        // TODO: Maybe only makes sense if fabs is free. Integer test of
+        // exponent bits seems better for x86.
+        SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op);
+        SDValue SmallestNormal = DAG.getConstantFP(
+            APFloat::getSmallestNormalized(Semantics), DL, OperandVT);
+        return DAG.getSetCC(DL, ResultVT, Abs, SmallestNormal,
+                            IsOrdered ? OrderedOp : UnorderedOp);
+      }
+    }
   }
 
   // In the general case use integer operations.
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 532b2c09a9175..cc4d4c4543a51 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -2602,24 +2602,22 @@ define i1 @issubnormal_or_nan_f(float %x) {
 define i1 @issubnormal_or_zero_or_nan_f(float %x) {
 ; X86-LABEL: issubnormal_or_zero_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    sete %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT:    setge %al
-; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fabs
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    sete %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT:    setge %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
   ret i1 %class
@@ -2773,24 +2771,22 @@ define i1 @not_issubnormal_or_nan_f(float %x) {
 define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
 ; X86-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT:    setne %cl
-; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT:    setl %al
-; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fabs
+; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT:    setne %cl
-; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT:    setl %al
-; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
   ret i1 %class



More information about the llvm-branch-commits mailing list